{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1537,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006506180871828237,
"grad_norm": 3.778571605682373,
"learning_rate": 0.0001,
"loss": 4.706,
"step": 1
},
{
"epoch": 0.0013012361743656475,
"grad_norm": 0.7331739068031311,
"learning_rate": 0.0001,
"loss": 2.6402,
"step": 2
},
{
"epoch": 0.001951854261548471,
"grad_norm": 0.5679969191551208,
"learning_rate": 0.0001,
"loss": 2.5315,
"step": 3
},
{
"epoch": 0.002602472348731295,
"grad_norm": 0.6543067693710327,
"learning_rate": 0.0001,
"loss": 2.5226,
"step": 4
},
{
"epoch": 0.0032530904359141183,
"grad_norm": 0.42487671971321106,
"learning_rate": 0.0001,
"loss": 2.1375,
"step": 5
},
{
"epoch": 0.003903708523096942,
"grad_norm": 0.48795655369758606,
"learning_rate": 0.0001,
"loss": 2.253,
"step": 6
},
{
"epoch": 0.004554326610279766,
"grad_norm": 0.6054234504699707,
"learning_rate": 0.0001,
"loss": 2.3411,
"step": 7
},
{
"epoch": 0.00520494469746259,
"grad_norm": 0.3039970397949219,
"learning_rate": 0.0001,
"loss": 2.1293,
"step": 8
},
{
"epoch": 0.005855562784645413,
"grad_norm": 0.6592361330986023,
"learning_rate": 0.0001,
"loss": 3.1615,
"step": 9
},
{
"epoch": 0.006506180871828237,
"grad_norm": 0.4017999470233917,
"learning_rate": 0.0001,
"loss": 2.5068,
"step": 10
},
{
"epoch": 0.0071567989590110605,
"grad_norm": 0.31507641077041626,
"learning_rate": 0.0001,
"loss": 2.1894,
"step": 11
},
{
"epoch": 0.007807417046193884,
"grad_norm": 0.33226895332336426,
"learning_rate": 0.0001,
"loss": 2.2006,
"step": 12
},
{
"epoch": 0.008458035133376708,
"grad_norm": 0.2632739841938019,
"learning_rate": 0.0001,
"loss": 2.0998,
"step": 13
},
{
"epoch": 0.009108653220559532,
"grad_norm": 0.2794795036315918,
"learning_rate": 0.0001,
"loss": 2.113,
"step": 14
},
{
"epoch": 0.009759271307742356,
"grad_norm": 0.29168492555618286,
"learning_rate": 0.0001,
"loss": 2.354,
"step": 15
},
{
"epoch": 0.01040988939492518,
"grad_norm": 0.2537970244884491,
"learning_rate": 0.0001,
"loss": 2.2939,
"step": 16
},
{
"epoch": 0.011060507482108002,
"grad_norm": 0.5140053033828735,
"learning_rate": 0.0001,
"loss": 2.6237,
"step": 17
},
{
"epoch": 0.011711125569290826,
"grad_norm": 0.3093675971031189,
"learning_rate": 0.0001,
"loss": 2.3502,
"step": 18
},
{
"epoch": 0.01236174365647365,
"grad_norm": 0.29241421818733215,
"learning_rate": 0.0001,
"loss": 2.5365,
"step": 19
},
{
"epoch": 0.013012361743656473,
"grad_norm": 0.3164322078227997,
"learning_rate": 0.0001,
"loss": 2.396,
"step": 20
},
{
"epoch": 0.013662979830839297,
"grad_norm": 0.24512743949890137,
"learning_rate": 0.0001,
"loss": 2.2759,
"step": 21
},
{
"epoch": 0.014313597918022121,
"grad_norm": 0.24328342080116272,
"learning_rate": 0.0001,
"loss": 2.2103,
"step": 22
},
{
"epoch": 0.014964216005204945,
"grad_norm": 0.2563220262527466,
"learning_rate": 0.0001,
"loss": 2.4836,
"step": 23
},
{
"epoch": 0.015614834092387769,
"grad_norm": 0.33601588010787964,
"learning_rate": 0.0001,
"loss": 2.4446,
"step": 24
},
{
"epoch": 0.01626545217957059,
"grad_norm": 0.28699007630348206,
"learning_rate": 0.0001,
"loss": 2.8504,
"step": 25
},
{
"epoch": 0.016916070266753416,
"grad_norm": 0.3181653618812561,
"learning_rate": 0.0001,
"loss": 2.3042,
"step": 26
},
{
"epoch": 0.01756668835393624,
"grad_norm": 0.2349390834569931,
"learning_rate": 0.0001,
"loss": 2.1024,
"step": 27
},
{
"epoch": 0.018217306441119064,
"grad_norm": 0.2751820981502533,
"learning_rate": 0.0001,
"loss": 2.2646,
"step": 28
},
{
"epoch": 0.018867924528301886,
"grad_norm": 0.25547271966934204,
"learning_rate": 0.0001,
"loss": 2.1928,
"step": 29
},
{
"epoch": 0.01951854261548471,
"grad_norm": 0.283507764339447,
"learning_rate": 0.0001,
"loss": 2.3073,
"step": 30
},
{
"epoch": 0.020169160702667534,
"grad_norm": 0.3354213237762451,
"learning_rate": 0.0001,
"loss": 2.6273,
"step": 31
},
{
"epoch": 0.02081977878985036,
"grad_norm": 0.40484553575515747,
"learning_rate": 0.0001,
"loss": 2.4919,
"step": 32
},
{
"epoch": 0.02147039687703318,
"grad_norm": 0.34319421648979187,
"learning_rate": 0.0001,
"loss": 2.8381,
"step": 33
},
{
"epoch": 0.022121014964216004,
"grad_norm": 0.32958984375,
"learning_rate": 0.0001,
"loss": 2.3062,
"step": 34
},
{
"epoch": 0.02277163305139883,
"grad_norm": 0.4503105878829956,
"learning_rate": 0.0001,
"loss": 2.4647,
"step": 35
},
{
"epoch": 0.02342225113858165,
"grad_norm": 0.5084238052368164,
"learning_rate": 0.0001,
"loss": 3.0047,
"step": 36
},
{
"epoch": 0.024072869225764477,
"grad_norm": 0.5192400813102722,
"learning_rate": 0.0001,
"loss": 2.2899,
"step": 37
},
{
"epoch": 0.0247234873129473,
"grad_norm": 0.4197874665260315,
"learning_rate": 0.0001,
"loss": 2.4057,
"step": 38
},
{
"epoch": 0.025374105400130124,
"grad_norm": 0.5170285105705261,
"learning_rate": 0.0001,
"loss": 3.2918,
"step": 39
},
{
"epoch": 0.026024723487312947,
"grad_norm": 0.2491147667169571,
"learning_rate": 0.0001,
"loss": 2.1957,
"step": 40
},
{
"epoch": 0.026675341574495772,
"grad_norm": 0.6597635746002197,
"learning_rate": 0.0001,
"loss": 2.7474,
"step": 41
},
{
"epoch": 0.027325959661678594,
"grad_norm": 0.40205034613609314,
"learning_rate": 0.0001,
"loss": 2.4561,
"step": 42
},
{
"epoch": 0.02797657774886142,
"grad_norm": 0.27388331294059753,
"learning_rate": 0.0001,
"loss": 2.0477,
"step": 43
},
{
"epoch": 0.028627195836044242,
"grad_norm": 0.9163908958435059,
"learning_rate": 0.0001,
"loss": 3.334,
"step": 44
},
{
"epoch": 0.029277813923227064,
"grad_norm": 0.2747696042060852,
"learning_rate": 0.0001,
"loss": 2.1604,
"step": 45
},
{
"epoch": 0.02992843201040989,
"grad_norm": 0.36308085918426514,
"learning_rate": 0.0001,
"loss": 2.693,
"step": 46
},
{
"epoch": 0.03057905009759271,
"grad_norm": 0.6159886121749878,
"learning_rate": 0.0001,
"loss": 2.5515,
"step": 47
},
{
"epoch": 0.031229668184775537,
"grad_norm": 0.4801373779773712,
"learning_rate": 0.0001,
"loss": 2.809,
"step": 48
},
{
"epoch": 0.03188028627195836,
"grad_norm": 0.32580915093421936,
"learning_rate": 0.0001,
"loss": 2.5236,
"step": 49
},
{
"epoch": 0.03253090435914118,
"grad_norm": 0.3028671443462372,
"learning_rate": 0.0001,
"loss": 2.2685,
"step": 50
},
{
"epoch": 0.03318152244632401,
"grad_norm": 0.5660931468009949,
"learning_rate": 0.0001,
"loss": 2.2564,
"step": 51
},
{
"epoch": 0.03383214053350683,
"grad_norm": 0.24634602665901184,
"learning_rate": 0.0001,
"loss": 2.1355,
"step": 52
},
{
"epoch": 0.034482758620689655,
"grad_norm": 0.24830913543701172,
"learning_rate": 0.0001,
"loss": 2.0425,
"step": 53
},
{
"epoch": 0.03513337670787248,
"grad_norm": 0.23614570498466492,
"learning_rate": 0.0001,
"loss": 2.1975,
"step": 54
},
{
"epoch": 0.035783994795055306,
"grad_norm": 0.2624325156211853,
"learning_rate": 0.0001,
"loss": 2.3071,
"step": 55
},
{
"epoch": 0.03643461288223813,
"grad_norm": 0.3967755436897278,
"learning_rate": 0.0001,
"loss": 2.6088,
"step": 56
},
{
"epoch": 0.03708523096942095,
"grad_norm": 0.22147373855113983,
"learning_rate": 0.0001,
"loss": 2.003,
"step": 57
},
{
"epoch": 0.03773584905660377,
"grad_norm": 0.47795867919921875,
"learning_rate": 0.0001,
"loss": 2.1473,
"step": 58
},
{
"epoch": 0.038386467143786594,
"grad_norm": 0.43953707814216614,
"learning_rate": 0.0001,
"loss": 2.6595,
"step": 59
},
{
"epoch": 0.03903708523096942,
"grad_norm": 0.29031845927238464,
"learning_rate": 0.0001,
"loss": 2.3173,
"step": 60
},
{
"epoch": 0.039687703318152245,
"grad_norm": 0.2491024285554886,
"learning_rate": 0.0001,
"loss": 2.0575,
"step": 61
},
{
"epoch": 0.04033832140533507,
"grad_norm": 0.3025687634944916,
"learning_rate": 0.0001,
"loss": 2.0965,
"step": 62
},
{
"epoch": 0.04098893949251789,
"grad_norm": 0.26097819209098816,
"learning_rate": 0.0001,
"loss": 2.2583,
"step": 63
},
{
"epoch": 0.04163955757970072,
"grad_norm": 0.2413238286972046,
"learning_rate": 0.0001,
"loss": 2.2441,
"step": 64
},
{
"epoch": 0.04229017566688354,
"grad_norm": 0.2332315295934677,
"learning_rate": 0.0001,
"loss": 2.185,
"step": 65
},
{
"epoch": 0.04294079375406636,
"grad_norm": 0.4037252366542816,
"learning_rate": 0.0001,
"loss": 2.3875,
"step": 66
},
{
"epoch": 0.043591411841249185,
"grad_norm": 0.34149354696273804,
"learning_rate": 0.0001,
"loss": 2.3835,
"step": 67
},
{
"epoch": 0.04424202992843201,
"grad_norm": 0.23793481290340424,
"learning_rate": 0.0001,
"loss": 2.3521,
"step": 68
},
{
"epoch": 0.044892648015614836,
"grad_norm": 0.24252744019031525,
"learning_rate": 0.0001,
"loss": 2.0984,
"step": 69
},
{
"epoch": 0.04554326610279766,
"grad_norm": 0.2870447635650635,
"learning_rate": 0.0001,
"loss": 2.5408,
"step": 70
},
{
"epoch": 0.04619388418998048,
"grad_norm": 0.5050077438354492,
"learning_rate": 0.0001,
"loss": 2.7091,
"step": 71
},
{
"epoch": 0.0468445022771633,
"grad_norm": 0.2391565591096878,
"learning_rate": 0.0001,
"loss": 2.1601,
"step": 72
},
{
"epoch": 0.04749512036434613,
"grad_norm": 0.20647507905960083,
"learning_rate": 0.0001,
"loss": 1.9582,
"step": 73
},
{
"epoch": 0.048145738451528954,
"grad_norm": 0.26072338223457336,
"learning_rate": 0.0001,
"loss": 2.3577,
"step": 74
},
{
"epoch": 0.048796356538711776,
"grad_norm": 0.28378504514694214,
"learning_rate": 0.0001,
"loss": 2.349,
"step": 75
},
{
"epoch": 0.0494469746258946,
"grad_norm": 0.2536943256855011,
"learning_rate": 0.0001,
"loss": 2.375,
"step": 76
},
{
"epoch": 0.05009759271307743,
"grad_norm": 0.29276445508003235,
"learning_rate": 0.0001,
"loss": 2.5003,
"step": 77
},
{
"epoch": 0.05074821080026025,
"grad_norm": 0.2649310231208801,
"learning_rate": 0.0001,
"loss": 2.3247,
"step": 78
},
{
"epoch": 0.05139882888744307,
"grad_norm": 0.38125383853912354,
"learning_rate": 0.0001,
"loss": 2.5405,
"step": 79
},
{
"epoch": 0.05204944697462589,
"grad_norm": 0.40980008244514465,
"learning_rate": 0.0001,
"loss": 2.212,
"step": 80
},
{
"epoch": 0.052700065061808715,
"grad_norm": 0.5363492965698242,
"learning_rate": 0.0001,
"loss": 2.6499,
"step": 81
},
{
"epoch": 0.053350683148991544,
"grad_norm": 0.34647300839424133,
"learning_rate": 0.0001,
"loss": 2.6302,
"step": 82
},
{
"epoch": 0.054001301236174366,
"grad_norm": 0.27607980370521545,
"learning_rate": 0.0001,
"loss": 2.1819,
"step": 83
},
{
"epoch": 0.05465191932335719,
"grad_norm": 0.27654680609703064,
"learning_rate": 0.0001,
"loss": 2.1763,
"step": 84
},
{
"epoch": 0.05530253741054001,
"grad_norm": 0.24596217274665833,
"learning_rate": 0.0001,
"loss": 2.2585,
"step": 85
},
{
"epoch": 0.05595315549772284,
"grad_norm": 0.24279890954494476,
"learning_rate": 0.0001,
"loss": 2.4247,
"step": 86
},
{
"epoch": 0.05660377358490566,
"grad_norm": 0.2918747365474701,
"learning_rate": 0.0001,
"loss": 2.3986,
"step": 87
},
{
"epoch": 0.057254391672088484,
"grad_norm": 0.26778745651245117,
"learning_rate": 0.0001,
"loss": 2.3592,
"step": 88
},
{
"epoch": 0.057905009759271306,
"grad_norm": 0.39637815952301025,
"learning_rate": 0.0001,
"loss": 2.8006,
"step": 89
},
{
"epoch": 0.05855562784645413,
"grad_norm": 0.2676962614059448,
"learning_rate": 0.0001,
"loss": 2.2384,
"step": 90
},
{
"epoch": 0.05920624593363696,
"grad_norm": 0.3044937252998352,
"learning_rate": 0.0001,
"loss": 2.7762,
"step": 91
},
{
"epoch": 0.05985686402081978,
"grad_norm": 0.23922136425971985,
"learning_rate": 0.0001,
"loss": 2.0873,
"step": 92
},
{
"epoch": 0.0605074821080026,
"grad_norm": 0.25385046005249023,
"learning_rate": 0.0001,
"loss": 2.2708,
"step": 93
},
{
"epoch": 0.06115810019518542,
"grad_norm": 0.378401517868042,
"learning_rate": 0.0001,
"loss": 3.0583,
"step": 94
},
{
"epoch": 0.06180871828236825,
"grad_norm": 0.37193092703819275,
"learning_rate": 0.0001,
"loss": 2.3632,
"step": 95
},
{
"epoch": 0.062459336369551074,
"grad_norm": 0.3757643699645996,
"learning_rate": 0.0001,
"loss": 2.4071,
"step": 96
},
{
"epoch": 0.0631099544567339,
"grad_norm": 0.272833913564682,
"learning_rate": 0.0001,
"loss": 2.3989,
"step": 97
},
{
"epoch": 0.06376057254391672,
"grad_norm": 0.26533326506614685,
"learning_rate": 0.0001,
"loss": 2.1716,
"step": 98
},
{
"epoch": 0.06441119063109954,
"grad_norm": 0.5787199139595032,
"learning_rate": 0.0001,
"loss": 2.9445,
"step": 99
},
{
"epoch": 0.06506180871828236,
"grad_norm": 0.29046157002449036,
"learning_rate": 0.0001,
"loss": 2.3325,
"step": 100
},
{
"epoch": 0.06571242680546518,
"grad_norm": 0.531452476978302,
"learning_rate": 0.0001,
"loss": 2.7445,
"step": 101
},
{
"epoch": 0.06636304489264802,
"grad_norm": 0.3969165086746216,
"learning_rate": 0.0001,
"loss": 2.7126,
"step": 102
},
{
"epoch": 0.06701366297983084,
"grad_norm": 0.24183356761932373,
"learning_rate": 0.0001,
"loss": 1.9971,
"step": 103
},
{
"epoch": 0.06766428106701367,
"grad_norm": 0.3268399238586426,
"learning_rate": 0.0001,
"loss": 2.1055,
"step": 104
},
{
"epoch": 0.06831489915419649,
"grad_norm": 0.2625877559185028,
"learning_rate": 0.0001,
"loss": 1.9946,
"step": 105
},
{
"epoch": 0.06896551724137931,
"grad_norm": 0.2720443308353424,
"learning_rate": 0.0001,
"loss": 2.0764,
"step": 106
},
{
"epoch": 0.06961613532856213,
"grad_norm": 0.20969334244728088,
"learning_rate": 0.0001,
"loss": 1.8687,
"step": 107
},
{
"epoch": 0.07026675341574495,
"grad_norm": 0.26211223006248474,
"learning_rate": 0.0001,
"loss": 2.2042,
"step": 108
},
{
"epoch": 0.07091737150292778,
"grad_norm": 0.27889683842658997,
"learning_rate": 0.0001,
"loss": 2.3146,
"step": 109
},
{
"epoch": 0.07156798959011061,
"grad_norm": 0.2657179832458496,
"learning_rate": 0.0001,
"loss": 2.1021,
"step": 110
},
{
"epoch": 0.07221860767729343,
"grad_norm": 0.26620885729789734,
"learning_rate": 0.0001,
"loss": 2.3488,
"step": 111
},
{
"epoch": 0.07286922576447626,
"grad_norm": 0.4223373830318451,
"learning_rate": 0.0001,
"loss": 2.5289,
"step": 112
},
{
"epoch": 0.07351984385165908,
"grad_norm": 0.35398781299591064,
"learning_rate": 0.0001,
"loss": 2.5702,
"step": 113
},
{
"epoch": 0.0741704619388419,
"grad_norm": 0.23328129947185516,
"learning_rate": 0.0001,
"loss": 2.1292,
"step": 114
},
{
"epoch": 0.07482108002602472,
"grad_norm": 0.33508536219596863,
"learning_rate": 0.0001,
"loss": 2.2049,
"step": 115
},
{
"epoch": 0.07547169811320754,
"grad_norm": 0.2646953761577606,
"learning_rate": 0.0001,
"loss": 2.3445,
"step": 116
},
{
"epoch": 0.07612231620039037,
"grad_norm": 0.27866706252098083,
"learning_rate": 0.0001,
"loss": 2.2472,
"step": 117
},
{
"epoch": 0.07677293428757319,
"grad_norm": 0.35688602924346924,
"learning_rate": 0.0001,
"loss": 2.5045,
"step": 118
},
{
"epoch": 0.07742355237475602,
"grad_norm": 0.24262933433055878,
"learning_rate": 0.0001,
"loss": 2.4565,
"step": 119
},
{
"epoch": 0.07807417046193885,
"grad_norm": 0.44757333397865295,
"learning_rate": 0.0001,
"loss": 2.1619,
"step": 120
},
{
"epoch": 0.07872478854912167,
"grad_norm": 0.3279111385345459,
"learning_rate": 0.0001,
"loss": 2.3996,
"step": 121
},
{
"epoch": 0.07937540663630449,
"grad_norm": 0.25862693786621094,
"learning_rate": 0.0001,
"loss": 2.3214,
"step": 122
},
{
"epoch": 0.08002602472348731,
"grad_norm": 0.30093592405319214,
"learning_rate": 0.0001,
"loss": 2.6446,
"step": 123
},
{
"epoch": 0.08067664281067013,
"grad_norm": 0.25440871715545654,
"learning_rate": 0.0001,
"loss": 2.1181,
"step": 124
},
{
"epoch": 0.08132726089785296,
"grad_norm": 0.19935627281665802,
"learning_rate": 0.0001,
"loss": 2.0904,
"step": 125
},
{
"epoch": 0.08197787898503578,
"grad_norm": 0.27385473251342773,
"learning_rate": 0.0001,
"loss": 2.0829,
"step": 126
},
{
"epoch": 0.0826284970722186,
"grad_norm": 0.24417711794376373,
"learning_rate": 0.0001,
"loss": 2.0019,
"step": 127
},
{
"epoch": 0.08327911515940144,
"grad_norm": 0.27386653423309326,
"learning_rate": 0.0001,
"loss": 2.2743,
"step": 128
},
{
"epoch": 0.08392973324658426,
"grad_norm": 0.22413575649261475,
"learning_rate": 0.0001,
"loss": 2.1584,
"step": 129
},
{
"epoch": 0.08458035133376708,
"grad_norm": 0.27748343348503113,
"learning_rate": 0.0001,
"loss": 2.1428,
"step": 130
},
{
"epoch": 0.0852309694209499,
"grad_norm": 0.18890976905822754,
"learning_rate": 0.0001,
"loss": 1.9474,
"step": 131
},
{
"epoch": 0.08588158750813273,
"grad_norm": 0.3067719340324402,
"learning_rate": 0.0001,
"loss": 2.287,
"step": 132
},
{
"epoch": 0.08653220559531555,
"grad_norm": 0.35126858949661255,
"learning_rate": 0.0001,
"loss": 2.5086,
"step": 133
},
{
"epoch": 0.08718282368249837,
"grad_norm": 0.19619591534137726,
"learning_rate": 0.0001,
"loss": 2.0132,
"step": 134
},
{
"epoch": 0.08783344176968119,
"grad_norm": 0.360569566488266,
"learning_rate": 0.0001,
"loss": 2.607,
"step": 135
},
{
"epoch": 0.08848405985686401,
"grad_norm": 0.22566738724708557,
"learning_rate": 0.0001,
"loss": 2.0942,
"step": 136
},
{
"epoch": 0.08913467794404685,
"grad_norm": 0.27346086502075195,
"learning_rate": 0.0001,
"loss": 2.3139,
"step": 137
},
{
"epoch": 0.08978529603122967,
"grad_norm": 0.2500152289867401,
"learning_rate": 0.0001,
"loss": 2.0815,
"step": 138
},
{
"epoch": 0.0904359141184125,
"grad_norm": 0.22101153433322906,
"learning_rate": 0.0001,
"loss": 2.374,
"step": 139
},
{
"epoch": 0.09108653220559532,
"grad_norm": 0.2173723727464676,
"learning_rate": 0.0001,
"loss": 2.0084,
"step": 140
},
{
"epoch": 0.09173715029277814,
"grad_norm": 0.28956499695777893,
"learning_rate": 0.0001,
"loss": 2.6283,
"step": 141
},
{
"epoch": 0.09238776837996096,
"grad_norm": 0.27032795548439026,
"learning_rate": 0.0001,
"loss": 2.142,
"step": 142
},
{
"epoch": 0.09303838646714378,
"grad_norm": 0.24320480227470398,
"learning_rate": 0.0001,
"loss": 2.1402,
"step": 143
},
{
"epoch": 0.0936890045543266,
"grad_norm": 0.3127799332141876,
"learning_rate": 0.0001,
"loss": 2.6671,
"step": 144
},
{
"epoch": 0.09433962264150944,
"grad_norm": 0.30706024169921875,
"learning_rate": 0.0001,
"loss": 2.3026,
"step": 145
},
{
"epoch": 0.09499024072869226,
"grad_norm": 0.2378646731376648,
"learning_rate": 0.0001,
"loss": 2.0422,
"step": 146
},
{
"epoch": 0.09564085881587508,
"grad_norm": 0.24755406379699707,
"learning_rate": 0.0001,
"loss": 2.2574,
"step": 147
},
{
"epoch": 0.09629147690305791,
"grad_norm": 0.34464696049690247,
"learning_rate": 0.0001,
"loss": 2.2817,
"step": 148
},
{
"epoch": 0.09694209499024073,
"grad_norm": 0.30485469102859497,
"learning_rate": 0.0001,
"loss": 2.7303,
"step": 149
},
{
"epoch": 0.09759271307742355,
"grad_norm": 0.1860698163509369,
"learning_rate": 0.0001,
"loss": 1.8582,
"step": 150
},
{
"epoch": 0.09824333116460637,
"grad_norm": 0.23853841423988342,
"learning_rate": 0.0001,
"loss": 2.1378,
"step": 151
},
{
"epoch": 0.0988939492517892,
"grad_norm": 0.20248261094093323,
"learning_rate": 0.0001,
"loss": 2.1888,
"step": 152
},
{
"epoch": 0.09954456733897202,
"grad_norm": 0.3582792282104492,
"learning_rate": 0.0001,
"loss": 2.6726,
"step": 153
},
{
"epoch": 0.10019518542615485,
"grad_norm": 0.2576686441898346,
"learning_rate": 0.0001,
"loss": 2.4494,
"step": 154
},
{
"epoch": 0.10084580351333768,
"grad_norm": 0.306029349565506,
"learning_rate": 0.0001,
"loss": 2.2273,
"step": 155
},
{
"epoch": 0.1014964216005205,
"grad_norm": 0.31375500559806824,
"learning_rate": 0.0001,
"loss": 2.2474,
"step": 156
},
{
"epoch": 0.10214703968770332,
"grad_norm": 0.253250390291214,
"learning_rate": 0.0001,
"loss": 2.0142,
"step": 157
},
{
"epoch": 0.10279765777488614,
"grad_norm": 0.3098273277282715,
"learning_rate": 0.0001,
"loss": 2.2516,
"step": 158
},
{
"epoch": 0.10344827586206896,
"grad_norm": 0.3239591717720032,
"learning_rate": 0.0001,
"loss": 2.2432,
"step": 159
},
{
"epoch": 0.10409889394925179,
"grad_norm": 0.24929773807525635,
"learning_rate": 0.0001,
"loss": 2.2495,
"step": 160
},
{
"epoch": 0.10474951203643461,
"grad_norm": 0.3203783929347992,
"learning_rate": 0.0001,
"loss": 2.68,
"step": 161
},
{
"epoch": 0.10540013012361743,
"grad_norm": 0.38844674825668335,
"learning_rate": 0.0001,
"loss": 2.7457,
"step": 162
},
{
"epoch": 0.10605074821080027,
"grad_norm": 0.21753644943237305,
"learning_rate": 0.0001,
"loss": 2.1284,
"step": 163
},
{
"epoch": 0.10670136629798309,
"grad_norm": 0.20610418915748596,
"learning_rate": 0.0001,
"loss": 1.8377,
"step": 164
},
{
"epoch": 0.10735198438516591,
"grad_norm": 0.3555772304534912,
"learning_rate": 0.0001,
"loss": 2.3599,
"step": 165
},
{
"epoch": 0.10800260247234873,
"grad_norm": 0.3971005380153656,
"learning_rate": 0.0001,
"loss": 2.2771,
"step": 166
},
{
"epoch": 0.10865322055953155,
"grad_norm": 0.28628769516944885,
"learning_rate": 0.0001,
"loss": 2.2438,
"step": 167
},
{
"epoch": 0.10930383864671438,
"grad_norm": 0.38728833198547363,
"learning_rate": 0.0001,
"loss": 2.4103,
"step": 168
},
{
"epoch": 0.1099544567338972,
"grad_norm": 0.26340189576148987,
"learning_rate": 0.0001,
"loss": 2.6832,
"step": 169
},
{
"epoch": 0.11060507482108002,
"grad_norm": 0.20119386911392212,
"learning_rate": 0.0001,
"loss": 1.9622,
"step": 170
},
{
"epoch": 0.11125569290826284,
"grad_norm": 0.2929171621799469,
"learning_rate": 0.0001,
"loss": 2.2762,
"step": 171
},
{
"epoch": 0.11190631099544568,
"grad_norm": 0.422146201133728,
"learning_rate": 0.0001,
"loss": 2.4015,
"step": 172
},
{
"epoch": 0.1125569290826285,
"grad_norm": 0.29050537943840027,
"learning_rate": 0.0001,
"loss": 2.4399,
"step": 173
},
{
"epoch": 0.11320754716981132,
"grad_norm": 0.2646816074848175,
"learning_rate": 0.0001,
"loss": 2.3058,
"step": 174
},
{
"epoch": 0.11385816525699415,
"grad_norm": 0.2643061578273773,
"learning_rate": 0.0001,
"loss": 2.1892,
"step": 175
},
{
"epoch": 0.11450878334417697,
"grad_norm": 0.5878323316574097,
"learning_rate": 0.0001,
"loss": 3.2198,
"step": 176
},
{
"epoch": 0.11515940143135979,
"grad_norm": 0.36881884932518005,
"learning_rate": 0.0001,
"loss": 2.4112,
"step": 177
},
{
"epoch": 0.11581001951854261,
"grad_norm": 0.25198304653167725,
"learning_rate": 0.0001,
"loss": 2.1667,
"step": 178
},
{
"epoch": 0.11646063760572543,
"grad_norm": 0.34164664149284363,
"learning_rate": 0.0001,
"loss": 2.6248,
"step": 179
},
{
"epoch": 0.11711125569290826,
"grad_norm": 0.41471973061561584,
"learning_rate": 0.0001,
"loss": 2.5616,
"step": 180
},
{
"epoch": 0.11776187378009109,
"grad_norm": 0.26372480392456055,
"learning_rate": 0.0001,
"loss": 2.2904,
"step": 181
},
{
"epoch": 0.11841249186727391,
"grad_norm": 0.2271176278591156,
"learning_rate": 0.0001,
"loss": 2.0312,
"step": 182
},
{
"epoch": 0.11906310995445674,
"grad_norm": 0.2106996774673462,
"learning_rate": 0.0001,
"loss": 1.9661,
"step": 183
},
{
"epoch": 0.11971372804163956,
"grad_norm": 0.22870291769504547,
"learning_rate": 0.0001,
"loss": 1.9052,
"step": 184
},
{
"epoch": 0.12036434612882238,
"grad_norm": 0.41253864765167236,
"learning_rate": 0.0001,
"loss": 2.3747,
"step": 185
},
{
"epoch": 0.1210149642160052,
"grad_norm": 0.3258817791938782,
"learning_rate": 0.0001,
"loss": 2.5401,
"step": 186
},
{
"epoch": 0.12166558230318802,
"grad_norm": 0.3461870551109314,
"learning_rate": 0.0001,
"loss": 2.8027,
"step": 187
},
{
"epoch": 0.12231620039037085,
"grad_norm": 0.3704046607017517,
"learning_rate": 0.0001,
"loss": 2.799,
"step": 188
},
{
"epoch": 0.12296681847755368,
"grad_norm": 0.30265969038009644,
"learning_rate": 0.0001,
"loss": 2.4287,
"step": 189
},
{
"epoch": 0.1236174365647365,
"grad_norm": 0.4215582013130188,
"learning_rate": 0.0001,
"loss": 2.6857,
"step": 190
},
{
"epoch": 0.12426805465191933,
"grad_norm": 0.3003520965576172,
"learning_rate": 0.0001,
"loss": 2.4155,
"step": 191
},
{
"epoch": 0.12491867273910215,
"grad_norm": 0.412749320268631,
"learning_rate": 0.0001,
"loss": 2.6352,
"step": 192
},
{
"epoch": 0.12556929082628496,
"grad_norm": 0.2772350013256073,
"learning_rate": 0.0001,
"loss": 2.2452,
"step": 193
},
{
"epoch": 0.1262199089134678,
"grad_norm": 0.21457143127918243,
"learning_rate": 0.0001,
"loss": 2.0172,
"step": 194
},
{
"epoch": 0.12687052700065063,
"grad_norm": 0.40995845198631287,
"learning_rate": 0.0001,
"loss": 2.6218,
"step": 195
},
{
"epoch": 0.12752114508783344,
"grad_norm": 0.2253209501504898,
"learning_rate": 0.0001,
"loss": 2.2319,
"step": 196
},
{
"epoch": 0.12817176317501627,
"grad_norm": 0.36564287543296814,
"learning_rate": 0.0001,
"loss": 2.4585,
"step": 197
},
{
"epoch": 0.12882238126219908,
"grad_norm": 0.41084784269332886,
"learning_rate": 0.0001,
"loss": 2.6326,
"step": 198
},
{
"epoch": 0.12947299934938192,
"grad_norm": 0.36012157797813416,
"learning_rate": 0.0001,
"loss": 2.0168,
"step": 199
},
{
"epoch": 0.13012361743656473,
"grad_norm": 0.5138425230979919,
"learning_rate": 0.0001,
"loss": 2.3377,
"step": 200
},
{
"epoch": 0.13077423552374756,
"grad_norm": 0.2799031436443329,
"learning_rate": 0.0001,
"loss": 2.532,
"step": 201
},
{
"epoch": 0.13142485361093037,
"grad_norm": 0.3078779876232147,
"learning_rate": 0.0001,
"loss": 2.044,
"step": 202
},
{
"epoch": 0.1320754716981132,
"grad_norm": 0.31270912289619446,
"learning_rate": 0.0001,
"loss": 1.8576,
"step": 203
},
{
"epoch": 0.13272608978529604,
"grad_norm": 0.23117204010486603,
"learning_rate": 0.0001,
"loss": 2.1908,
"step": 204
},
{
"epoch": 0.13337670787247885,
"grad_norm": 0.2531285285949707,
"learning_rate": 0.0001,
"loss": 2.143,
"step": 205
},
{
"epoch": 0.1340273259596617,
"grad_norm": 0.28053218126296997,
"learning_rate": 0.0001,
"loss": 2.6902,
"step": 206
},
{
"epoch": 0.1346779440468445,
"grad_norm": 0.2600589692592621,
"learning_rate": 0.0001,
"loss": 2.0355,
"step": 207
},
{
"epoch": 0.13532856213402733,
"grad_norm": 0.2725912630558014,
"learning_rate": 0.0001,
"loss": 2.3949,
"step": 208
},
{
"epoch": 0.13597918022121014,
"grad_norm": 0.6166338324546814,
"learning_rate": 0.0001,
"loss": 2.8146,
"step": 209
},
{
"epoch": 0.13662979830839297,
"grad_norm": 0.4028575122356415,
"learning_rate": 0.0001,
"loss": 2.888,
"step": 210
},
{
"epoch": 0.1372804163955758,
"grad_norm": 0.23181548714637756,
"learning_rate": 0.0001,
"loss": 2.1406,
"step": 211
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.24338063597679138,
"learning_rate": 0.0001,
"loss": 2.1564,
"step": 212
},
{
"epoch": 0.13858165256994145,
"grad_norm": 0.233146533370018,
"learning_rate": 0.0001,
"loss": 2.1695,
"step": 213
},
{
"epoch": 0.13923227065712426,
"grad_norm": 0.21236726641654968,
"learning_rate": 0.0001,
"loss": 1.9272,
"step": 214
},
{
"epoch": 0.1398828887443071,
"grad_norm": 0.25471317768096924,
"learning_rate": 0.0001,
"loss": 2.3447,
"step": 215
},
{
"epoch": 0.1405335068314899,
"grad_norm": 0.35532835125923157,
"learning_rate": 0.0001,
"loss": 2.4328,
"step": 216
},
{
"epoch": 0.14118412491867274,
"grad_norm": 0.32900944352149963,
"learning_rate": 0.0001,
"loss": 2.385,
"step": 217
},
{
"epoch": 0.14183474300585555,
"grad_norm": 0.45404863357543945,
"learning_rate": 0.0001,
"loss": 2.8053,
"step": 218
},
{
"epoch": 0.1424853610930384,
"grad_norm": 0.33968400955200195,
"learning_rate": 0.0001,
"loss": 2.4524,
"step": 219
},
{
"epoch": 0.14313597918022122,
"grad_norm": 0.3250170946121216,
"learning_rate": 0.0001,
"loss": 2.6173,
"step": 220
},
{
"epoch": 0.14378659726740403,
"grad_norm": 0.34765559434890747,
"learning_rate": 0.0001,
"loss": 2.8468,
"step": 221
},
{
"epoch": 0.14443721535458687,
"grad_norm": 0.2274564653635025,
"learning_rate": 0.0001,
"loss": 2.1305,
"step": 222
},
{
"epoch": 0.14508783344176968,
"grad_norm": 0.42719507217407227,
"learning_rate": 0.0001,
"loss": 2.3682,
"step": 223
},
{
"epoch": 0.1457384515289525,
"grad_norm": 0.2848481833934784,
"learning_rate": 0.0001,
"loss": 2.0923,
"step": 224
},
{
"epoch": 0.14638906961613532,
"grad_norm": 0.266548752784729,
"learning_rate": 0.0001,
"loss": 2.0393,
"step": 225
},
{
"epoch": 0.14703968770331816,
"grad_norm": 0.24076099693775177,
"learning_rate": 0.0001,
"loss": 2.2674,
"step": 226
},
{
"epoch": 0.14769030579050096,
"grad_norm": 0.23347622156143188,
"learning_rate": 0.0001,
"loss": 1.9455,
"step": 227
},
{
"epoch": 0.1483409238776838,
"grad_norm": 0.3925648033618927,
"learning_rate": 0.0001,
"loss": 2.7117,
"step": 228
},
{
"epoch": 0.14899154196486664,
"grad_norm": 0.27654924988746643,
"learning_rate": 0.0001,
"loss": 2.1306,
"step": 229
},
{
"epoch": 0.14964216005204944,
"grad_norm": 0.2853853702545166,
"learning_rate": 0.0001,
"loss": 2.4369,
"step": 230
},
{
"epoch": 0.15029277813923228,
"grad_norm": 0.4509859085083008,
"learning_rate": 0.0001,
"loss": 2.6047,
"step": 231
},
{
"epoch": 0.1509433962264151,
"grad_norm": 0.2515909671783447,
"learning_rate": 0.0001,
"loss": 2.2065,
"step": 232
},
{
"epoch": 0.15159401431359792,
"grad_norm": 0.5977367162704468,
"learning_rate": 0.0001,
"loss": 2.7133,
"step": 233
},
{
"epoch": 0.15224463240078073,
"grad_norm": 0.30381399393081665,
"learning_rate": 0.0001,
"loss": 2.343,
"step": 234
},
{
"epoch": 0.15289525048796357,
"grad_norm": 0.27204832434654236,
"learning_rate": 0.0001,
"loss": 2.2908,
"step": 235
},
{
"epoch": 0.15354586857514638,
"grad_norm": 0.6246710419654846,
"learning_rate": 0.0001,
"loss": 2.7862,
"step": 236
},
{
"epoch": 0.1541964866623292,
"grad_norm": 0.4803178012371063,
"learning_rate": 0.0001,
"loss": 3.4388,
"step": 237
},
{
"epoch": 0.15484710474951205,
"grad_norm": 0.3038940727710724,
"learning_rate": 0.0001,
"loss": 2.7409,
"step": 238
},
{
"epoch": 0.15549772283669486,
"grad_norm": 0.2494591474533081,
"learning_rate": 0.0001,
"loss": 2.2601,
"step": 239
},
{
"epoch": 0.1561483409238777,
"grad_norm": 0.23808616399765015,
"learning_rate": 0.0001,
"loss": 2.1319,
"step": 240
},
{
"epoch": 0.1567989590110605,
"grad_norm": 0.3111306130886078,
"learning_rate": 0.0001,
"loss": 2.7414,
"step": 241
},
{
"epoch": 0.15744957709824334,
"grad_norm": 0.22197599709033966,
"learning_rate": 0.0001,
"loss": 2.1346,
"step": 242
},
{
"epoch": 0.15810019518542615,
"grad_norm": 0.2681500315666199,
"learning_rate": 0.0001,
"loss": 2.3779,
"step": 243
},
{
"epoch": 0.15875081327260898,
"grad_norm": 0.2612643241882324,
"learning_rate": 0.0001,
"loss": 2.5743,
"step": 244
},
{
"epoch": 0.1594014313597918,
"grad_norm": 0.201397106051445,
"learning_rate": 0.0001,
"loss": 2.0312,
"step": 245
},
{
"epoch": 0.16005204944697463,
"grad_norm": 0.25662410259246826,
"learning_rate": 0.0001,
"loss": 2.5085,
"step": 246
},
{
"epoch": 0.16070266753415746,
"grad_norm": 0.21460294723510742,
"learning_rate": 0.0001,
"loss": 2.1099,
"step": 247
},
{
"epoch": 0.16135328562134027,
"grad_norm": 0.19971312582492828,
"learning_rate": 0.0001,
"loss": 2.1024,
"step": 248
},
{
"epoch": 0.1620039037085231,
"grad_norm": 0.1986059844493866,
"learning_rate": 0.0001,
"loss": 1.9306,
"step": 249
},
{
"epoch": 0.16265452179570591,
"grad_norm": 0.21961884200572968,
"learning_rate": 0.0001,
"loss": 2.1218,
"step": 250
},
{
"epoch": 0.16330513988288875,
"grad_norm": 0.20071017742156982,
"learning_rate": 0.0001,
"loss": 2.0581,
"step": 251
},
{
"epoch": 0.16395575797007156,
"grad_norm": 0.32734909653663635,
"learning_rate": 0.0001,
"loss": 2.6229,
"step": 252
},
{
"epoch": 0.1646063760572544,
"grad_norm": 0.21822451055049896,
"learning_rate": 0.0001,
"loss": 1.9954,
"step": 253
},
{
"epoch": 0.1652569941444372,
"grad_norm": 0.3013177216053009,
"learning_rate": 0.0001,
"loss": 2.454,
"step": 254
},
{
"epoch": 0.16590761223162004,
"grad_norm": 0.31199347972869873,
"learning_rate": 0.0001,
"loss": 2.815,
"step": 255
},
{
"epoch": 0.16655823031880287,
"grad_norm": 0.2255464345216751,
"learning_rate": 0.0001,
"loss": 2.0232,
"step": 256
},
{
"epoch": 0.16720884840598568,
"grad_norm": 0.21208804845809937,
"learning_rate": 0.0001,
"loss": 1.9663,
"step": 257
},
{
"epoch": 0.16785946649316852,
"grad_norm": 0.2432132512331009,
"learning_rate": 0.0001,
"loss": 2.4189,
"step": 258
},
{
"epoch": 0.16851008458035133,
"grad_norm": 0.21116623282432556,
"learning_rate": 0.0001,
"loss": 2.0761,
"step": 259
},
{
"epoch": 0.16916070266753416,
"grad_norm": 0.18722975254058838,
"learning_rate": 0.0001,
"loss": 1.9537,
"step": 260
},
{
"epoch": 0.16981132075471697,
"grad_norm": 0.2683362662792206,
"learning_rate": 0.0001,
"loss": 2.4483,
"step": 261
},
{
"epoch": 0.1704619388418998,
"grad_norm": 0.2739648222923279,
"learning_rate": 0.0001,
"loss": 2.3754,
"step": 262
},
{
"epoch": 0.17111255692908262,
"grad_norm": 0.1836375594139099,
"learning_rate": 0.0001,
"loss": 2.0103,
"step": 263
},
{
"epoch": 0.17176317501626545,
"grad_norm": 0.34002602100372314,
"learning_rate": 0.0001,
"loss": 2.2626,
"step": 264
},
{
"epoch": 0.1724137931034483,
"grad_norm": 0.19341516494750977,
"learning_rate": 0.0001,
"loss": 1.9751,
"step": 265
},
{
"epoch": 0.1730644111906311,
"grad_norm": 0.25080743432044983,
"learning_rate": 0.0001,
"loss": 2.2162,
"step": 266
},
{
"epoch": 0.17371502927781393,
"grad_norm": 0.2362661212682724,
"learning_rate": 0.0001,
"loss": 2.0226,
"step": 267
},
{
"epoch": 0.17436564736499674,
"grad_norm": 0.25844064354896545,
"learning_rate": 0.0001,
"loss": 2.3176,
"step": 268
},
{
"epoch": 0.17501626545217958,
"grad_norm": 0.3904498517513275,
"learning_rate": 0.0001,
"loss": 2.4871,
"step": 269
},
{
"epoch": 0.17566688353936238,
"grad_norm": 0.22143317759037018,
"learning_rate": 0.0001,
"loss": 2.2073,
"step": 270
},
{
"epoch": 0.17631750162654522,
"grad_norm": 0.20974211394786835,
"learning_rate": 0.0001,
"loss": 2.1393,
"step": 271
},
{
"epoch": 0.17696811971372803,
"grad_norm": 0.24463056027889252,
"learning_rate": 0.0001,
"loss": 2.0203,
"step": 272
},
{
"epoch": 0.17761873780091086,
"grad_norm": 0.23296399414539337,
"learning_rate": 0.0001,
"loss": 2.1096,
"step": 273
},
{
"epoch": 0.1782693558880937,
"grad_norm": 0.4122619926929474,
"learning_rate": 0.0001,
"loss": 3.1512,
"step": 274
},
{
"epoch": 0.1789199739752765,
"grad_norm": 0.2744470536708832,
"learning_rate": 0.0001,
"loss": 2.2211,
"step": 275
},
{
"epoch": 0.17957059206245934,
"grad_norm": 0.21010619401931763,
"learning_rate": 0.0001,
"loss": 2.2203,
"step": 276
},
{
"epoch": 0.18022121014964215,
"grad_norm": 0.27855056524276733,
"learning_rate": 0.0001,
"loss": 2.2903,
"step": 277
},
{
"epoch": 0.180871828236825,
"grad_norm": 0.2909989058971405,
"learning_rate": 0.0001,
"loss": 2.237,
"step": 278
},
{
"epoch": 0.1815224463240078,
"grad_norm": 0.21754448115825653,
"learning_rate": 0.0001,
"loss": 2.0138,
"step": 279
},
{
"epoch": 0.18217306441119063,
"grad_norm": 0.35209745168685913,
"learning_rate": 0.0001,
"loss": 2.652,
"step": 280
},
{
"epoch": 0.18282368249837344,
"grad_norm": 0.29994750022888184,
"learning_rate": 0.0001,
"loss": 2.1868,
"step": 281
},
{
"epoch": 0.18347430058555628,
"grad_norm": 0.2645902633666992,
"learning_rate": 0.0001,
"loss": 2.2925,
"step": 282
},
{
"epoch": 0.1841249186727391,
"grad_norm": 0.3492202162742615,
"learning_rate": 0.0001,
"loss": 2.4176,
"step": 283
},
{
"epoch": 0.18477553675992192,
"grad_norm": 0.256651371717453,
"learning_rate": 0.0001,
"loss": 2.3414,
"step": 284
},
{
"epoch": 0.18542615484710476,
"grad_norm": 0.23287786543369293,
"learning_rate": 0.0001,
"loss": 2.5488,
"step": 285
},
{
"epoch": 0.18607677293428757,
"grad_norm": 0.26059290766716003,
"learning_rate": 0.0001,
"loss": 2.4551,
"step": 286
},
{
"epoch": 0.1867273910214704,
"grad_norm": 0.2482365071773529,
"learning_rate": 0.0001,
"loss": 2.0818,
"step": 287
},
{
"epoch": 0.1873780091086532,
"grad_norm": 0.23024773597717285,
"learning_rate": 0.0001,
"loss": 2.2592,
"step": 288
},
{
"epoch": 0.18802862719583605,
"grad_norm": 0.2590011656284332,
"learning_rate": 0.0001,
"loss": 2.4177,
"step": 289
},
{
"epoch": 0.18867924528301888,
"grad_norm": 0.19760870933532715,
"learning_rate": 0.0001,
"loss": 2.0731,
"step": 290
},
{
"epoch": 0.1893298633702017,
"grad_norm": 0.20266428589820862,
"learning_rate": 0.0001,
"loss": 2.1221,
"step": 291
},
{
"epoch": 0.18998048145738453,
"grad_norm": 0.20199884474277496,
"learning_rate": 0.0001,
"loss": 2.0489,
"step": 292
},
{
"epoch": 0.19063109954456733,
"grad_norm": 0.23876360058784485,
"learning_rate": 0.0001,
"loss": 2.1392,
"step": 293
},
{
"epoch": 0.19128171763175017,
"grad_norm": 0.23555997014045715,
"learning_rate": 0.0001,
"loss": 2.4116,
"step": 294
},
{
"epoch": 0.19193233571893298,
"grad_norm": 0.5010725259780884,
"learning_rate": 0.0001,
"loss": 2.7444,
"step": 295
},
{
"epoch": 0.19258295380611581,
"grad_norm": 0.37809622287750244,
"learning_rate": 0.0001,
"loss": 2.2635,
"step": 296
},
{
"epoch": 0.19323357189329862,
"grad_norm": 0.499888151884079,
"learning_rate": 0.0001,
"loss": 2.1984,
"step": 297
},
{
"epoch": 0.19388418998048146,
"grad_norm": 0.43810585141181946,
"learning_rate": 0.0001,
"loss": 3.084,
"step": 298
},
{
"epoch": 0.1945348080676643,
"grad_norm": 0.35633769631385803,
"learning_rate": 0.0001,
"loss": 2.0351,
"step": 299
},
{
"epoch": 0.1951854261548471,
"grad_norm": 0.3693079650402069,
"learning_rate": 0.0001,
"loss": 1.9525,
"step": 300
},
{
"epoch": 0.19583604424202994,
"grad_norm": 0.36550503969192505,
"learning_rate": 0.0001,
"loss": 2.2469,
"step": 301
},
{
"epoch": 0.19648666232921275,
"grad_norm": 0.2579827308654785,
"learning_rate": 0.0001,
"loss": 2.3585,
"step": 302
},
{
"epoch": 0.19713728041639558,
"grad_norm": 0.2603841722011566,
"learning_rate": 0.0001,
"loss": 2.3959,
"step": 303
},
{
"epoch": 0.1977878985035784,
"grad_norm": 0.33103683590888977,
"learning_rate": 0.0001,
"loss": 2.2197,
"step": 304
},
{
"epoch": 0.19843851659076123,
"grad_norm": 0.2977697551250458,
"learning_rate": 0.0001,
"loss": 2.2569,
"step": 305
},
{
"epoch": 0.19908913467794404,
"grad_norm": 0.2085130512714386,
"learning_rate": 0.0001,
"loss": 2.2284,
"step": 306
},
{
"epoch": 0.19973975276512687,
"grad_norm": 0.409212201833725,
"learning_rate": 0.0001,
"loss": 2.7014,
"step": 307
},
{
"epoch": 0.2003903708523097,
"grad_norm": 0.2447553277015686,
"learning_rate": 0.0001,
"loss": 2.2826,
"step": 308
},
{
"epoch": 0.20104098893949252,
"grad_norm": 0.21881726384162903,
"learning_rate": 0.0001,
"loss": 1.8573,
"step": 309
},
{
"epoch": 0.20169160702667535,
"grad_norm": 0.24484936892986298,
"learning_rate": 0.0001,
"loss": 2.318,
"step": 310
},
{
"epoch": 0.20234222511385816,
"grad_norm": 0.3251173198223114,
"learning_rate": 0.0001,
"loss": 2.3346,
"step": 311
},
{
"epoch": 0.202992843201041,
"grad_norm": 0.22313712537288666,
"learning_rate": 0.0001,
"loss": 1.9119,
"step": 312
},
{
"epoch": 0.2036434612882238,
"grad_norm": 0.3086949288845062,
"learning_rate": 0.0001,
"loss": 2.1809,
"step": 313
},
{
"epoch": 0.20429407937540664,
"grad_norm": 0.28272122144699097,
"learning_rate": 0.0001,
"loss": 2.3335,
"step": 314
},
{
"epoch": 0.20494469746258945,
"grad_norm": 0.208637535572052,
"learning_rate": 0.0001,
"loss": 2.1947,
"step": 315
},
{
"epoch": 0.20559531554977228,
"grad_norm": 0.2913041114807129,
"learning_rate": 0.0001,
"loss": 2.3009,
"step": 316
},
{
"epoch": 0.20624593363695512,
"grad_norm": 0.2813785970211029,
"learning_rate": 0.0001,
"loss": 2.0133,
"step": 317
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.2324337363243103,
"learning_rate": 0.0001,
"loss": 2.0827,
"step": 318
},
{
"epoch": 0.20754716981132076,
"grad_norm": 0.25195491313934326,
"learning_rate": 0.0001,
"loss": 2.5201,
"step": 319
},
{
"epoch": 0.20819778789850357,
"grad_norm": 0.3435034453868866,
"learning_rate": 0.0001,
"loss": 2.321,
"step": 320
},
{
"epoch": 0.2088484059856864,
"grad_norm": 0.2735581696033478,
"learning_rate": 0.0001,
"loss": 2.2218,
"step": 321
},
{
"epoch": 0.20949902407286922,
"grad_norm": 0.2250661551952362,
"learning_rate": 0.0001,
"loss": 1.9416,
"step": 322
},
{
"epoch": 0.21014964216005205,
"grad_norm": 0.3160262107849121,
"learning_rate": 0.0001,
"loss": 2.5494,
"step": 323
},
{
"epoch": 0.21080026024723486,
"grad_norm": 0.3669279217720032,
"learning_rate": 0.0001,
"loss": 2.7751,
"step": 324
},
{
"epoch": 0.2114508783344177,
"grad_norm": 0.2052752673625946,
"learning_rate": 0.0001,
"loss": 2.0139,
"step": 325
},
{
"epoch": 0.21210149642160053,
"grad_norm": 0.2906612455844879,
"learning_rate": 0.0001,
"loss": 2.227,
"step": 326
},
{
"epoch": 0.21275211450878334,
"grad_norm": 0.30327048897743225,
"learning_rate": 0.0001,
"loss": 2.2905,
"step": 327
},
{
"epoch": 0.21340273259596618,
"grad_norm": 0.33950623869895935,
"learning_rate": 0.0001,
"loss": 3.0731,
"step": 328
},
{
"epoch": 0.21405335068314899,
"grad_norm": 0.31319788098335266,
"learning_rate": 0.0001,
"loss": 2.1374,
"step": 329
},
{
"epoch": 0.21470396877033182,
"grad_norm": 0.21442054212093353,
"learning_rate": 0.0001,
"loss": 1.7588,
"step": 330
},
{
"epoch": 0.21535458685751463,
"grad_norm": 0.23125174641609192,
"learning_rate": 0.0001,
"loss": 1.9295,
"step": 331
},
{
"epoch": 0.21600520494469747,
"grad_norm": 0.23220308125019073,
"learning_rate": 0.0001,
"loss": 2.2606,
"step": 332
},
{
"epoch": 0.21665582303188027,
"grad_norm": 0.24599219858646393,
"learning_rate": 0.0001,
"loss": 2.2687,
"step": 333
},
{
"epoch": 0.2173064411190631,
"grad_norm": 0.22226236760616302,
"learning_rate": 0.0001,
"loss": 2.1428,
"step": 334
},
{
"epoch": 0.21795705920624595,
"grad_norm": 0.2653510570526123,
"learning_rate": 0.0001,
"loss": 2.4381,
"step": 335
},
{
"epoch": 0.21860767729342875,
"grad_norm": 0.23770929872989655,
"learning_rate": 0.0001,
"loss": 1.9655,
"step": 336
},
{
"epoch": 0.2192582953806116,
"grad_norm": 0.1932332068681717,
"learning_rate": 0.0001,
"loss": 1.9465,
"step": 337
},
{
"epoch": 0.2199089134677944,
"grad_norm": 0.181661456823349,
"learning_rate": 0.0001,
"loss": 1.9912,
"step": 338
},
{
"epoch": 0.22055953155497723,
"grad_norm": 0.22275297343730927,
"learning_rate": 0.0001,
"loss": 2.1964,
"step": 339
},
{
"epoch": 0.22121014964216004,
"grad_norm": 0.22086840867996216,
"learning_rate": 0.0001,
"loss": 2.2216,
"step": 340
},
{
"epoch": 0.22186076772934288,
"grad_norm": 0.22807130217552185,
"learning_rate": 0.0001,
"loss": 2.2434,
"step": 341
},
{
"epoch": 0.2225113858165257,
"grad_norm": 0.26616647839546204,
"learning_rate": 0.0001,
"loss": 2.442,
"step": 342
},
{
"epoch": 0.22316200390370852,
"grad_norm": 0.2841719388961792,
"learning_rate": 0.0001,
"loss": 2.2358,
"step": 343
},
{
"epoch": 0.22381262199089136,
"grad_norm": 0.23251943290233612,
"learning_rate": 0.0001,
"loss": 2.3436,
"step": 344
},
{
"epoch": 0.22446324007807417,
"grad_norm": 0.20406994223594666,
"learning_rate": 0.0001,
"loss": 2.101,
"step": 345
},
{
"epoch": 0.225113858165257,
"grad_norm": 0.18677304685115814,
"learning_rate": 0.0001,
"loss": 2.0596,
"step": 346
},
{
"epoch": 0.2257644762524398,
"grad_norm": 0.22367873787879944,
"learning_rate": 0.0001,
"loss": 2.2051,
"step": 347
},
{
"epoch": 0.22641509433962265,
"grad_norm": 0.2521246671676636,
"learning_rate": 0.0001,
"loss": 2.1718,
"step": 348
},
{
"epoch": 0.22706571242680545,
"grad_norm": 0.23043319582939148,
"learning_rate": 0.0001,
"loss": 2.2818,
"step": 349
},
{
"epoch": 0.2277163305139883,
"grad_norm": 0.22021251916885376,
"learning_rate": 0.0001,
"loss": 2.0337,
"step": 350
},
{
"epoch": 0.2283669486011711,
"grad_norm": 0.18043603003025055,
"learning_rate": 0.0001,
"loss": 1.9434,
"step": 351
},
{
"epoch": 0.22901756668835394,
"grad_norm": 0.4757142961025238,
"learning_rate": 0.0001,
"loss": 2.2467,
"step": 352
},
{
"epoch": 0.22966818477553677,
"grad_norm": 0.30740290880203247,
"learning_rate": 0.0001,
"loss": 2.5296,
"step": 353
},
{
"epoch": 0.23031880286271958,
"grad_norm": 0.23037666082382202,
"learning_rate": 0.0001,
"loss": 2.311,
"step": 354
},
{
"epoch": 0.23096942094990242,
"grad_norm": 0.22314564883708954,
"learning_rate": 0.0001,
"loss": 2.0494,
"step": 355
},
{
"epoch": 0.23162003903708522,
"grad_norm": 0.21417242288589478,
"learning_rate": 0.0001,
"loss": 2.2459,
"step": 356
},
{
"epoch": 0.23227065712426806,
"grad_norm": 0.2895831763744354,
"learning_rate": 0.0001,
"loss": 2.2705,
"step": 357
},
{
"epoch": 0.23292127521145087,
"grad_norm": 0.2110838145017624,
"learning_rate": 0.0001,
"loss": 2.1175,
"step": 358
},
{
"epoch": 0.2335718932986337,
"grad_norm": 0.3999682664871216,
"learning_rate": 0.0001,
"loss": 2.6891,
"step": 359
},
{
"epoch": 0.2342225113858165,
"grad_norm": 0.5169201493263245,
"learning_rate": 0.0001,
"loss": 2.5764,
"step": 360
},
{
"epoch": 0.23487312947299935,
"grad_norm": 0.24382548034191132,
"learning_rate": 0.0001,
"loss": 2.1065,
"step": 361
},
{
"epoch": 0.23552374756018218,
"grad_norm": 0.2830081582069397,
"learning_rate": 0.0001,
"loss": 2.1186,
"step": 362
},
{
"epoch": 0.236174365647365,
"grad_norm": 0.23680554330348969,
"learning_rate": 0.0001,
"loss": 2.118,
"step": 363
},
{
"epoch": 0.23682498373454783,
"grad_norm": 0.3790690302848816,
"learning_rate": 0.0001,
"loss": 2.3566,
"step": 364
},
{
"epoch": 0.23747560182173064,
"grad_norm": 0.2664685845375061,
"learning_rate": 0.0001,
"loss": 2.2118,
"step": 365
},
{
"epoch": 0.23812621990891347,
"grad_norm": 0.22439126670360565,
"learning_rate": 0.0001,
"loss": 2.0897,
"step": 366
},
{
"epoch": 0.23877683799609628,
"grad_norm": 0.2559892237186432,
"learning_rate": 0.0001,
"loss": 2.2559,
"step": 367
},
{
"epoch": 0.23942745608327912,
"grad_norm": 0.43989577889442444,
"learning_rate": 0.0001,
"loss": 2.5208,
"step": 368
},
{
"epoch": 0.24007807417046195,
"grad_norm": 0.24543894827365875,
"learning_rate": 0.0001,
"loss": 2.1692,
"step": 369
},
{
"epoch": 0.24072869225764476,
"grad_norm": 0.37020954489707947,
"learning_rate": 0.0001,
"loss": 2.1287,
"step": 370
},
{
"epoch": 0.2413793103448276,
"grad_norm": 0.41815564036369324,
"learning_rate": 0.0001,
"loss": 2.5952,
"step": 371
},
{
"epoch": 0.2420299284320104,
"grad_norm": 0.22579136490821838,
"learning_rate": 0.0001,
"loss": 2.2427,
"step": 372
},
{
"epoch": 0.24268054651919324,
"grad_norm": 0.3004798889160156,
"learning_rate": 0.0001,
"loss": 2.2767,
"step": 373
},
{
"epoch": 0.24333116460637605,
"grad_norm": 0.27470141649246216,
"learning_rate": 0.0001,
"loss": 2.092,
"step": 374
},
{
"epoch": 0.24398178269355889,
"grad_norm": 0.25301867723464966,
"learning_rate": 0.0001,
"loss": 2.1816,
"step": 375
},
{
"epoch": 0.2446324007807417,
"grad_norm": 0.21194620430469513,
"learning_rate": 0.0001,
"loss": 2.1322,
"step": 376
},
{
"epoch": 0.24528301886792453,
"grad_norm": 0.28737103939056396,
"learning_rate": 0.0001,
"loss": 2.6685,
"step": 377
},
{
"epoch": 0.24593363695510737,
"grad_norm": 0.28857922554016113,
"learning_rate": 0.0001,
"loss": 2.2219,
"step": 378
},
{
"epoch": 0.24658425504229017,
"grad_norm": 0.29493409395217896,
"learning_rate": 0.0001,
"loss": 2.717,
"step": 379
},
{
"epoch": 0.247234873129473,
"grad_norm": 0.33975929021835327,
"learning_rate": 0.0001,
"loss": 2.3499,
"step": 380
},
{
"epoch": 0.24788549121665582,
"grad_norm": 0.21486152708530426,
"learning_rate": 0.0001,
"loss": 2.306,
"step": 381
},
{
"epoch": 0.24853610930383865,
"grad_norm": 0.2686431109905243,
"learning_rate": 0.0001,
"loss": 2.0942,
"step": 382
},
{
"epoch": 0.24918672739102146,
"grad_norm": 0.2812007963657379,
"learning_rate": 0.0001,
"loss": 2.3729,
"step": 383
},
{
"epoch": 0.2498373454782043,
"grad_norm": 0.31875330209732056,
"learning_rate": 0.0001,
"loss": 2.5766,
"step": 384
},
{
"epoch": 0.2504879635653871,
"grad_norm": 0.2624376714229584,
"learning_rate": 0.0001,
"loss": 2.2057,
"step": 385
},
{
"epoch": 0.2511385816525699,
"grad_norm": 0.265286386013031,
"learning_rate": 0.0001,
"loss": 2.2405,
"step": 386
},
{
"epoch": 0.2517891997397528,
"grad_norm": 0.3202246129512787,
"learning_rate": 0.0001,
"loss": 2.2817,
"step": 387
},
{
"epoch": 0.2524398178269356,
"grad_norm": 0.22770161926746368,
"learning_rate": 0.0001,
"loss": 1.9564,
"step": 388
},
{
"epoch": 0.2530904359141184,
"grad_norm": 0.3313138484954834,
"learning_rate": 0.0001,
"loss": 2.4424,
"step": 389
},
{
"epoch": 0.25374105400130126,
"grad_norm": 0.2961839437484741,
"learning_rate": 0.0001,
"loss": 2.4122,
"step": 390
},
{
"epoch": 0.25439167208848407,
"grad_norm": 0.24270308017730713,
"learning_rate": 0.0001,
"loss": 1.99,
"step": 391
},
{
"epoch": 0.2550422901756669,
"grad_norm": 0.2306670844554901,
"learning_rate": 0.0001,
"loss": 2.3529,
"step": 392
},
{
"epoch": 0.2556929082628497,
"grad_norm": 0.28387176990509033,
"learning_rate": 0.0001,
"loss": 2.0824,
"step": 393
},
{
"epoch": 0.25634352635003255,
"grad_norm": 0.3105824291706085,
"learning_rate": 0.0001,
"loss": 2.437,
"step": 394
},
{
"epoch": 0.25699414443721535,
"grad_norm": 0.1932361125946045,
"learning_rate": 0.0001,
"loss": 1.9747,
"step": 395
},
{
"epoch": 0.25764476252439816,
"grad_norm": 0.31146278977394104,
"learning_rate": 0.0001,
"loss": 2.263,
"step": 396
},
{
"epoch": 0.258295380611581,
"grad_norm": 0.24420365691184998,
"learning_rate": 0.0001,
"loss": 2.015,
"step": 397
},
{
"epoch": 0.25894599869876384,
"grad_norm": 0.24144989252090454,
"learning_rate": 0.0001,
"loss": 2.2536,
"step": 398
},
{
"epoch": 0.25959661678594664,
"grad_norm": 0.3478517532348633,
"learning_rate": 0.0001,
"loss": 2.5835,
"step": 399
},
{
"epoch": 0.26024723487312945,
"grad_norm": 0.24381348490715027,
"learning_rate": 0.0001,
"loss": 2.2439,
"step": 400
},
{
"epoch": 0.2608978529603123,
"grad_norm": 0.2834983468055725,
"learning_rate": 0.0001,
"loss": 2.3991,
"step": 401
},
{
"epoch": 0.2615484710474951,
"grad_norm": 0.28689858317375183,
"learning_rate": 0.0001,
"loss": 1.9156,
"step": 402
},
{
"epoch": 0.26219908913467793,
"grad_norm": 0.23692357540130615,
"learning_rate": 0.0001,
"loss": 2.0189,
"step": 403
},
{
"epoch": 0.26284970722186074,
"grad_norm": 0.30104926228523254,
"learning_rate": 0.0001,
"loss": 2.4945,
"step": 404
},
{
"epoch": 0.2635003253090436,
"grad_norm": 0.23472270369529724,
"learning_rate": 0.0001,
"loss": 1.8892,
"step": 405
},
{
"epoch": 0.2641509433962264,
"grad_norm": 0.31508034467697144,
"learning_rate": 0.0001,
"loss": 2.4935,
"step": 406
},
{
"epoch": 0.2648015614834092,
"grad_norm": 0.25103551149368286,
"learning_rate": 0.0001,
"loss": 2.4428,
"step": 407
},
{
"epoch": 0.2654521795705921,
"grad_norm": 0.2387259602546692,
"learning_rate": 0.0001,
"loss": 2.0989,
"step": 408
},
{
"epoch": 0.2661027976577749,
"grad_norm": 0.2606028616428375,
"learning_rate": 0.0001,
"loss": 1.9494,
"step": 409
},
{
"epoch": 0.2667534157449577,
"grad_norm": 0.25114724040031433,
"learning_rate": 0.0001,
"loss": 2.2432,
"step": 410
},
{
"epoch": 0.2674040338321405,
"grad_norm": 0.3072582483291626,
"learning_rate": 0.0001,
"loss": 2.3506,
"step": 411
},
{
"epoch": 0.2680546519193234,
"grad_norm": 0.23917561769485474,
"learning_rate": 0.0001,
"loss": 2.2665,
"step": 412
},
{
"epoch": 0.2687052700065062,
"grad_norm": 0.2120814174413681,
"learning_rate": 0.0001,
"loss": 1.9625,
"step": 413
},
{
"epoch": 0.269355888093689,
"grad_norm": 0.22003813087940216,
"learning_rate": 0.0001,
"loss": 2.1179,
"step": 414
},
{
"epoch": 0.27000650618087185,
"grad_norm": 0.33217060565948486,
"learning_rate": 0.0001,
"loss": 2.6353,
"step": 415
},
{
"epoch": 0.27065712426805466,
"grad_norm": 0.2260630577802658,
"learning_rate": 0.0001,
"loss": 2.0355,
"step": 416
},
{
"epoch": 0.27130774235523747,
"grad_norm": 0.30081093311309814,
"learning_rate": 0.0001,
"loss": 2.1825,
"step": 417
},
{
"epoch": 0.2719583604424203,
"grad_norm": 0.27275893092155457,
"learning_rate": 0.0001,
"loss": 2.6183,
"step": 418
},
{
"epoch": 0.27260897852960314,
"grad_norm": 0.4902358651161194,
"learning_rate": 0.0001,
"loss": 3.0888,
"step": 419
},
{
"epoch": 0.27325959661678595,
"grad_norm": 0.21213112771511078,
"learning_rate": 0.0001,
"loss": 2.1172,
"step": 420
},
{
"epoch": 0.27391021470396876,
"grad_norm": 0.35953450202941895,
"learning_rate": 0.0001,
"loss": 2.5109,
"step": 421
},
{
"epoch": 0.2745608327911516,
"grad_norm": 0.2081584334373474,
"learning_rate": 0.0001,
"loss": 2.0894,
"step": 422
},
{
"epoch": 0.27521145087833443,
"grad_norm": 0.20892906188964844,
"learning_rate": 0.0001,
"loss": 1.9643,
"step": 423
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.30058735609054565,
"learning_rate": 0.0001,
"loss": 2.6503,
"step": 424
},
{
"epoch": 0.27651268705270005,
"grad_norm": 0.32902124524116516,
"learning_rate": 0.0001,
"loss": 2.3271,
"step": 425
},
{
"epoch": 0.2771633051398829,
"grad_norm": 0.2003614902496338,
"learning_rate": 0.0001,
"loss": 1.9881,
"step": 426
},
{
"epoch": 0.2778139232270657,
"grad_norm": 0.33349111676216125,
"learning_rate": 0.0001,
"loss": 2.7625,
"step": 427
},
{
"epoch": 0.2784645413142485,
"grad_norm": 0.25051257014274597,
"learning_rate": 0.0001,
"loss": 2.0825,
"step": 428
},
{
"epoch": 0.27911515940143133,
"grad_norm": 0.3301559388637543,
"learning_rate": 0.0001,
"loss": 2.85,
"step": 429
},
{
"epoch": 0.2797657774886142,
"grad_norm": 0.18224254250526428,
"learning_rate": 0.0001,
"loss": 1.9687,
"step": 430
},
{
"epoch": 0.280416395575797,
"grad_norm": 0.21809989213943481,
"learning_rate": 0.0001,
"loss": 2.2596,
"step": 431
},
{
"epoch": 0.2810670136629798,
"grad_norm": 0.2473779171705246,
"learning_rate": 0.0001,
"loss": 2.2042,
"step": 432
},
{
"epoch": 0.2817176317501627,
"grad_norm": 0.20744885504245758,
"learning_rate": 0.0001,
"loss": 2.1546,
"step": 433
},
{
"epoch": 0.2823682498373455,
"grad_norm": 0.2620698809623718,
"learning_rate": 0.0001,
"loss": 2.5195,
"step": 434
},
{
"epoch": 0.2830188679245283,
"grad_norm": 0.291421115398407,
"learning_rate": 0.0001,
"loss": 2.4983,
"step": 435
},
{
"epoch": 0.2836694860117111,
"grad_norm": 0.3294708728790283,
"learning_rate": 0.0001,
"loss": 2.3146,
"step": 436
},
{
"epoch": 0.28432010409889397,
"grad_norm": 0.26191362738609314,
"learning_rate": 0.0001,
"loss": 2.2818,
"step": 437
},
{
"epoch": 0.2849707221860768,
"grad_norm": 0.29155483841896057,
"learning_rate": 0.0001,
"loss": 2.4888,
"step": 438
},
{
"epoch": 0.2856213402732596,
"grad_norm": 0.19482360780239105,
"learning_rate": 0.0001,
"loss": 2.0061,
"step": 439
},
{
"epoch": 0.28627195836044245,
"grad_norm": 0.2594612240791321,
"learning_rate": 0.0001,
"loss": 2.1891,
"step": 440
},
{
"epoch": 0.28692257644762525,
"grad_norm": 0.21656309068202972,
"learning_rate": 0.0001,
"loss": 1.7911,
"step": 441
},
{
"epoch": 0.28757319453480806,
"grad_norm": 0.18664829432964325,
"learning_rate": 0.0001,
"loss": 1.9634,
"step": 442
},
{
"epoch": 0.28822381262199087,
"grad_norm": 0.2178332507610321,
"learning_rate": 0.0001,
"loss": 2.32,
"step": 443
},
{
"epoch": 0.28887443070917374,
"grad_norm": 0.351418673992157,
"learning_rate": 0.0001,
"loss": 3.0873,
"step": 444
},
{
"epoch": 0.28952504879635654,
"grad_norm": 0.23604457080364227,
"learning_rate": 0.0001,
"loss": 2.46,
"step": 445
},
{
"epoch": 0.29017566688353935,
"grad_norm": 0.2599848806858063,
"learning_rate": 0.0001,
"loss": 2.0207,
"step": 446
},
{
"epoch": 0.29082628497072216,
"grad_norm": 0.340314120054245,
"learning_rate": 0.0001,
"loss": 2.279,
"step": 447
},
{
"epoch": 0.291476903057905,
"grad_norm": 0.23228399455547333,
"learning_rate": 0.0001,
"loss": 2.3561,
"step": 448
},
{
"epoch": 0.29212752114508783,
"grad_norm": 0.25504687428474426,
"learning_rate": 0.0001,
"loss": 2.2251,
"step": 449
},
{
"epoch": 0.29277813923227064,
"grad_norm": 0.2465014010667801,
"learning_rate": 0.0001,
"loss": 2.1031,
"step": 450
},
{
"epoch": 0.2934287573194535,
"grad_norm": 0.2188328504562378,
"learning_rate": 0.0001,
"loss": 2.1483,
"step": 451
},
{
"epoch": 0.2940793754066363,
"grad_norm": 0.24546551704406738,
"learning_rate": 0.0001,
"loss": 2.2334,
"step": 452
},
{
"epoch": 0.2947299934938191,
"grad_norm": 0.23416215181350708,
"learning_rate": 0.0001,
"loss": 2.1846,
"step": 453
},
{
"epoch": 0.29538061158100193,
"grad_norm": 0.25267231464385986,
"learning_rate": 0.0001,
"loss": 2.2134,
"step": 454
},
{
"epoch": 0.2960312296681848,
"grad_norm": 0.26632416248321533,
"learning_rate": 0.0001,
"loss": 2.5012,
"step": 455
},
{
"epoch": 0.2966818477553676,
"grad_norm": 0.18289139866828918,
"learning_rate": 0.0001,
"loss": 2.0524,
"step": 456
},
{
"epoch": 0.2973324658425504,
"grad_norm": 0.19033563137054443,
"learning_rate": 0.0001,
"loss": 2.0165,
"step": 457
},
{
"epoch": 0.2979830839297333,
"grad_norm": 0.200730562210083,
"learning_rate": 0.0001,
"loss": 1.8021,
"step": 458
},
{
"epoch": 0.2986337020169161,
"grad_norm": 0.2109062522649765,
"learning_rate": 0.0001,
"loss": 2.0655,
"step": 459
},
{
"epoch": 0.2992843201040989,
"grad_norm": 0.23461318016052246,
"learning_rate": 0.0001,
"loss": 2.3335,
"step": 460
},
{
"epoch": 0.2999349381912817,
"grad_norm": 0.2085726112127304,
"learning_rate": 0.0001,
"loss": 2.0061,
"step": 461
},
{
"epoch": 0.30058555627846456,
"grad_norm": 0.2938329875469208,
"learning_rate": 0.0001,
"loss": 2.5245,
"step": 462
},
{
"epoch": 0.30123617436564737,
"grad_norm": 0.22131232917308807,
"learning_rate": 0.0001,
"loss": 2.4115,
"step": 463
},
{
"epoch": 0.3018867924528302,
"grad_norm": 0.3459152579307556,
"learning_rate": 0.0001,
"loss": 2.3896,
"step": 464
},
{
"epoch": 0.302537410540013,
"grad_norm": 0.27464184165000916,
"learning_rate": 0.0001,
"loss": 2.6592,
"step": 465
},
{
"epoch": 0.30318802862719585,
"grad_norm": 0.28379327058792114,
"learning_rate": 0.0001,
"loss": 2.1453,
"step": 466
},
{
"epoch": 0.30383864671437866,
"grad_norm": 0.28283926844596863,
"learning_rate": 0.0001,
"loss": 2.1704,
"step": 467
},
{
"epoch": 0.30448926480156147,
"grad_norm": 0.22243599593639374,
"learning_rate": 0.0001,
"loss": 2.1175,
"step": 468
},
{
"epoch": 0.30513988288874433,
"grad_norm": 0.22331124544143677,
"learning_rate": 0.0001,
"loss": 1.8857,
"step": 469
},
{
"epoch": 0.30579050097592714,
"grad_norm": 0.21995989978313446,
"learning_rate": 0.0001,
"loss": 2.1316,
"step": 470
},
{
"epoch": 0.30644111906310995,
"grad_norm": 0.21140341460704803,
"learning_rate": 0.0001,
"loss": 2.0742,
"step": 471
},
{
"epoch": 0.30709173715029275,
"grad_norm": 0.31053757667541504,
"learning_rate": 0.0001,
"loss": 2.615,
"step": 472
},
{
"epoch": 0.3077423552374756,
"grad_norm": 0.2768484354019165,
"learning_rate": 0.0001,
"loss": 2.713,
"step": 473
},
{
"epoch": 0.3083929733246584,
"grad_norm": 0.2538318336009979,
"learning_rate": 0.0001,
"loss": 2.1917,
"step": 474
},
{
"epoch": 0.30904359141184123,
"grad_norm": 0.2105240672826767,
"learning_rate": 0.0001,
"loss": 2.2741,
"step": 475
},
{
"epoch": 0.3096942094990241,
"grad_norm": 0.2915903925895691,
"learning_rate": 0.0001,
"loss": 2.115,
"step": 476
},
{
"epoch": 0.3103448275862069,
"grad_norm": 0.30282047390937805,
"learning_rate": 0.0001,
"loss": 2.7806,
"step": 477
},
{
"epoch": 0.3109954456733897,
"grad_norm": 0.2707601487636566,
"learning_rate": 0.0001,
"loss": 2.6137,
"step": 478
},
{
"epoch": 0.3116460637605725,
"grad_norm": 0.34574300050735474,
"learning_rate": 0.0001,
"loss": 2.5957,
"step": 479
},
{
"epoch": 0.3122966818477554,
"grad_norm": 0.22767509520053864,
"learning_rate": 0.0001,
"loss": 2.3543,
"step": 480
},
{
"epoch": 0.3129472999349382,
"grad_norm": 0.25194215774536133,
"learning_rate": 0.0001,
"loss": 2.6586,
"step": 481
},
{
"epoch": 0.313597918022121,
"grad_norm": 0.20427219569683075,
"learning_rate": 0.0001,
"loss": 1.9091,
"step": 482
},
{
"epoch": 0.3142485361093038,
"grad_norm": 0.2993704378604889,
"learning_rate": 0.0001,
"loss": 2.4704,
"step": 483
},
{
"epoch": 0.3148991541964867,
"grad_norm": 0.18951758742332458,
"learning_rate": 0.0001,
"loss": 2.1108,
"step": 484
},
{
"epoch": 0.3155497722836695,
"grad_norm": 0.2622709572315216,
"learning_rate": 0.0001,
"loss": 2.4144,
"step": 485
},
{
"epoch": 0.3162003903708523,
"grad_norm": 0.20735126733779907,
"learning_rate": 0.0001,
"loss": 2.3065,
"step": 486
},
{
"epoch": 0.31685100845803515,
"grad_norm": 0.22782085835933685,
"learning_rate": 0.0001,
"loss": 2.4377,
"step": 487
},
{
"epoch": 0.31750162654521796,
"grad_norm": 0.2568935453891754,
"learning_rate": 0.0001,
"loss": 2.1199,
"step": 488
},
{
"epoch": 0.31815224463240077,
"grad_norm": 0.23917409777641296,
"learning_rate": 0.0001,
"loss": 2.2457,
"step": 489
},
{
"epoch": 0.3188028627195836,
"grad_norm": 0.21531902253627777,
"learning_rate": 0.0001,
"loss": 2.0489,
"step": 490
},
{
"epoch": 0.31945348080676644,
"grad_norm": 0.21461109817028046,
"learning_rate": 0.0001,
"loss": 2.1915,
"step": 491
},
{
"epoch": 0.32010409889394925,
"grad_norm": 0.2458680123090744,
"learning_rate": 0.0001,
"loss": 2.3939,
"step": 492
},
{
"epoch": 0.32075471698113206,
"grad_norm": 0.2617323696613312,
"learning_rate": 0.0001,
"loss": 2.5611,
"step": 493
},
{
"epoch": 0.3214053350683149,
"grad_norm": 0.22562618553638458,
"learning_rate": 0.0001,
"loss": 2.2703,
"step": 494
},
{
"epoch": 0.32205595315549773,
"grad_norm": 0.2290688008069992,
"learning_rate": 0.0001,
"loss": 2.3049,
"step": 495
},
{
"epoch": 0.32270657124268054,
"grad_norm": 0.4118833541870117,
"learning_rate": 0.0001,
"loss": 2.9194,
"step": 496
},
{
"epoch": 0.32335718932986335,
"grad_norm": 0.22502999007701874,
"learning_rate": 0.0001,
"loss": 2.2362,
"step": 497
},
{
"epoch": 0.3240078074170462,
"grad_norm": 0.23599191009998322,
"learning_rate": 0.0001,
"loss": 2.35,
"step": 498
},
{
"epoch": 0.324658425504229,
"grad_norm": 0.3065047860145569,
"learning_rate": 0.0001,
"loss": 2.3984,
"step": 499
},
{
"epoch": 0.32530904359141183,
"grad_norm": 0.19241982698440552,
"learning_rate": 0.0001,
"loss": 1.8787,
"step": 500
},
{
"epoch": 0.3259596616785947,
"grad_norm": 0.20695632696151733,
"learning_rate": 0.0001,
"loss": 1.9397,
"step": 501
},
{
"epoch": 0.3266102797657775,
"grad_norm": 0.1998564749956131,
"learning_rate": 0.0001,
"loss": 2.1463,
"step": 502
},
{
"epoch": 0.3272608978529603,
"grad_norm": 0.27775317430496216,
"learning_rate": 0.0001,
"loss": 2.7956,
"step": 503
},
{
"epoch": 0.3279115159401431,
"grad_norm": 0.2393936961889267,
"learning_rate": 0.0001,
"loss": 2.3785,
"step": 504
},
{
"epoch": 0.328562134027326,
"grad_norm": 0.20921163260936737,
"learning_rate": 0.0001,
"loss": 2.1909,
"step": 505
},
{
"epoch": 0.3292127521145088,
"grad_norm": 0.25875911116600037,
"learning_rate": 0.0001,
"loss": 2.129,
"step": 506
},
{
"epoch": 0.3298633702016916,
"grad_norm": 0.2382909208536148,
"learning_rate": 0.0001,
"loss": 2.3786,
"step": 507
},
{
"epoch": 0.3305139882888744,
"grad_norm": 0.19657136499881744,
"learning_rate": 0.0001,
"loss": 1.951,
"step": 508
},
{
"epoch": 0.33116460637605727,
"grad_norm": 0.23688004910945892,
"learning_rate": 0.0001,
"loss": 2.4348,
"step": 509
},
{
"epoch": 0.3318152244632401,
"grad_norm": 0.1988734006881714,
"learning_rate": 0.0001,
"loss": 2.2352,
"step": 510
},
{
"epoch": 0.3324658425504229,
"grad_norm": 0.2078763097524643,
"learning_rate": 0.0001,
"loss": 2.1376,
"step": 511
},
{
"epoch": 0.33311646063760575,
"grad_norm": 0.18860888481140137,
"learning_rate": 0.0001,
"loss": 1.9367,
"step": 512
},
{
"epoch": 0.33376707872478856,
"grad_norm": 0.30205249786376953,
"learning_rate": 0.0001,
"loss": 2.6822,
"step": 513
},
{
"epoch": 0.33441769681197137,
"grad_norm": 0.2146618664264679,
"learning_rate": 0.0001,
"loss": 2.1927,
"step": 514
},
{
"epoch": 0.3350683148991542,
"grad_norm": 0.19332504272460938,
"learning_rate": 0.0001,
"loss": 2.0442,
"step": 515
},
{
"epoch": 0.33571893298633704,
"grad_norm": 0.2289431244134903,
"learning_rate": 0.0001,
"loss": 2.0152,
"step": 516
},
{
"epoch": 0.33636955107351985,
"grad_norm": 0.21815945208072662,
"learning_rate": 0.0001,
"loss": 2.0015,
"step": 517
},
{
"epoch": 0.33702016916070265,
"grad_norm": 0.2226189821958542,
"learning_rate": 0.0001,
"loss": 2.2989,
"step": 518
},
{
"epoch": 0.3376707872478855,
"grad_norm": 0.22195078432559967,
"learning_rate": 0.0001,
"loss": 2.2237,
"step": 519
},
{
"epoch": 0.3383214053350683,
"grad_norm": 0.1946515589952469,
"learning_rate": 0.0001,
"loss": 1.9459,
"step": 520
},
{
"epoch": 0.33897202342225113,
"grad_norm": 0.21510568261146545,
"learning_rate": 0.0001,
"loss": 2.1305,
"step": 521
},
{
"epoch": 0.33962264150943394,
"grad_norm": 0.23448903858661652,
"learning_rate": 0.0001,
"loss": 2.1838,
"step": 522
},
{
"epoch": 0.3402732595966168,
"grad_norm": 0.19046911597251892,
"learning_rate": 0.0001,
"loss": 1.9739,
"step": 523
},
{
"epoch": 0.3409238776837996,
"grad_norm": 0.2314033806324005,
"learning_rate": 0.0001,
"loss": 2.2053,
"step": 524
},
{
"epoch": 0.3415744957709824,
"grad_norm": 0.2206612378358841,
"learning_rate": 0.0001,
"loss": 2.2566,
"step": 525
},
{
"epoch": 0.34222511385816523,
"grad_norm": 0.19578076899051666,
"learning_rate": 0.0001,
"loss": 2.045,
"step": 526
},
{
"epoch": 0.3428757319453481,
"grad_norm": 0.1787755936384201,
"learning_rate": 0.0001,
"loss": 1.8942,
"step": 527
},
{
"epoch": 0.3435263500325309,
"grad_norm": 0.20091751217842102,
"learning_rate": 0.0001,
"loss": 2.1576,
"step": 528
},
{
"epoch": 0.3441769681197137,
"grad_norm": 0.21869762241840363,
"learning_rate": 0.0001,
"loss": 2.1938,
"step": 529
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.26101449131965637,
"learning_rate": 0.0001,
"loss": 2.3642,
"step": 530
},
{
"epoch": 0.3454782042940794,
"grad_norm": 0.21874766051769257,
"learning_rate": 0.0001,
"loss": 2.4553,
"step": 531
},
{
"epoch": 0.3461288223812622,
"grad_norm": 0.224325492978096,
"learning_rate": 0.0001,
"loss": 2.2959,
"step": 532
},
{
"epoch": 0.346779440468445,
"grad_norm": 0.21268363296985626,
"learning_rate": 0.0001,
"loss": 2.1021,
"step": 533
},
{
"epoch": 0.34743005855562786,
"grad_norm": 0.20979231595993042,
"learning_rate": 0.0001,
"loss": 2.0304,
"step": 534
},
{
"epoch": 0.34808067664281067,
"grad_norm": 0.19552691280841827,
"learning_rate": 0.0001,
"loss": 1.9747,
"step": 535
},
{
"epoch": 0.3487312947299935,
"grad_norm": 0.27929842472076416,
"learning_rate": 0.0001,
"loss": 2.445,
"step": 536
},
{
"epoch": 0.34938191281717634,
"grad_norm": 0.19953188300132751,
"learning_rate": 0.0001,
"loss": 1.9766,
"step": 537
},
{
"epoch": 0.35003253090435915,
"grad_norm": 0.29898926615715027,
"learning_rate": 0.0001,
"loss": 2.4818,
"step": 538
},
{
"epoch": 0.35068314899154196,
"grad_norm": 0.18719644844532013,
"learning_rate": 0.0001,
"loss": 1.9046,
"step": 539
},
{
"epoch": 0.35133376707872477,
"grad_norm": 0.2602563798427582,
"learning_rate": 0.0001,
"loss": 2.1539,
"step": 540
},
{
"epoch": 0.35198438516590763,
"grad_norm": 0.23460406064987183,
"learning_rate": 0.0001,
"loss": 2.3826,
"step": 541
},
{
"epoch": 0.35263500325309044,
"grad_norm": 0.2821134328842163,
"learning_rate": 0.0001,
"loss": 2.223,
"step": 542
},
{
"epoch": 0.35328562134027325,
"grad_norm": 0.2641044557094574,
"learning_rate": 0.0001,
"loss": 2.2402,
"step": 543
},
{
"epoch": 0.35393623942745606,
"grad_norm": 0.21963565051555634,
"learning_rate": 0.0001,
"loss": 2.3988,
"step": 544
},
{
"epoch": 0.3545868575146389,
"grad_norm": 0.26475685834884644,
"learning_rate": 0.0001,
"loss": 2.3046,
"step": 545
},
{
"epoch": 0.35523747560182173,
"grad_norm": 0.27148157358169556,
"learning_rate": 0.0001,
"loss": 2.5076,
"step": 546
},
{
"epoch": 0.35588809368900454,
"grad_norm": 0.28925588726997375,
"learning_rate": 0.0001,
"loss": 2.8395,
"step": 547
},
{
"epoch": 0.3565387117761874,
"grad_norm": 0.22953632473945618,
"learning_rate": 0.0001,
"loss": 2.1198,
"step": 548
},
{
"epoch": 0.3571893298633702,
"grad_norm": 0.23960557579994202,
"learning_rate": 0.0001,
"loss": 2.3064,
"step": 549
},
{
"epoch": 0.357839947950553,
"grad_norm": 0.3133333921432495,
"learning_rate": 0.0001,
"loss": 2.6034,
"step": 550
},
{
"epoch": 0.3584905660377358,
"grad_norm": 0.21745215356349945,
"learning_rate": 0.0001,
"loss": 2.4553,
"step": 551
},
{
"epoch": 0.3591411841249187,
"grad_norm": 0.23547130823135376,
"learning_rate": 0.0001,
"loss": 2.0469,
"step": 552
},
{
"epoch": 0.3597918022121015,
"grad_norm": 0.2646094262599945,
"learning_rate": 0.0001,
"loss": 1.9016,
"step": 553
},
{
"epoch": 0.3604424202992843,
"grad_norm": 0.3079530596733093,
"learning_rate": 0.0001,
"loss": 2.8979,
"step": 554
},
{
"epoch": 0.36109303838646717,
"grad_norm": 0.38223740458488464,
"learning_rate": 0.0001,
"loss": 3.066,
"step": 555
},
{
"epoch": 0.36174365647365,
"grad_norm": 0.2535337209701538,
"learning_rate": 0.0001,
"loss": 2.1327,
"step": 556
},
{
"epoch": 0.3623942745608328,
"grad_norm": 0.2373637855052948,
"learning_rate": 0.0001,
"loss": 2.1141,
"step": 557
},
{
"epoch": 0.3630448926480156,
"grad_norm": 0.19437271356582642,
"learning_rate": 0.0001,
"loss": 1.9753,
"step": 558
},
{
"epoch": 0.36369551073519846,
"grad_norm": 0.20236878097057343,
"learning_rate": 0.0001,
"loss": 2.2516,
"step": 559
},
{
"epoch": 0.36434612882238127,
"grad_norm": 0.21252363920211792,
"learning_rate": 0.0001,
"loss": 2.3645,
"step": 560
},
{
"epoch": 0.3649967469095641,
"grad_norm": 0.21689258515834808,
"learning_rate": 0.0001,
"loss": 2.1145,
"step": 561
},
{
"epoch": 0.3656473649967469,
"grad_norm": 0.22365228831768036,
"learning_rate": 0.0001,
"loss": 2.3083,
"step": 562
},
{
"epoch": 0.36629798308392975,
"grad_norm": 0.21607807278633118,
"learning_rate": 0.0001,
"loss": 2.3199,
"step": 563
},
{
"epoch": 0.36694860117111255,
"grad_norm": 0.1885683536529541,
"learning_rate": 0.0001,
"loss": 1.9303,
"step": 564
},
{
"epoch": 0.36759921925829536,
"grad_norm": 0.20064905285835266,
"learning_rate": 0.0001,
"loss": 2.0661,
"step": 565
},
{
"epoch": 0.3682498373454782,
"grad_norm": 0.23532240092754364,
"learning_rate": 0.0001,
"loss": 2.6942,
"step": 566
},
{
"epoch": 0.36890045543266103,
"grad_norm": 0.22937807440757751,
"learning_rate": 0.0001,
"loss": 2.1962,
"step": 567
},
{
"epoch": 0.36955107351984384,
"grad_norm": 0.2540866732597351,
"learning_rate": 0.0001,
"loss": 2.5012,
"step": 568
},
{
"epoch": 0.37020169160702665,
"grad_norm": 0.23405294120311737,
"learning_rate": 0.0001,
"loss": 2.2439,
"step": 569
},
{
"epoch": 0.3708523096942095,
"grad_norm": 0.24394820630550385,
"learning_rate": 0.0001,
"loss": 2.0741,
"step": 570
},
{
"epoch": 0.3715029277813923,
"grad_norm": 0.2063736468553543,
"learning_rate": 0.0001,
"loss": 2.0864,
"step": 571
},
{
"epoch": 0.37215354586857513,
"grad_norm": 0.3300686180591583,
"learning_rate": 0.0001,
"loss": 2.4983,
"step": 572
},
{
"epoch": 0.372804163955758,
"grad_norm": 0.21294772624969482,
"learning_rate": 0.0001,
"loss": 2.2273,
"step": 573
},
{
"epoch": 0.3734547820429408,
"grad_norm": 0.2629190981388092,
"learning_rate": 0.0001,
"loss": 2.1732,
"step": 574
},
{
"epoch": 0.3741054001301236,
"grad_norm": 0.2141999751329422,
"learning_rate": 0.0001,
"loss": 2.3038,
"step": 575
},
{
"epoch": 0.3747560182173064,
"grad_norm": 0.3467566668987274,
"learning_rate": 0.0001,
"loss": 2.7748,
"step": 576
},
{
"epoch": 0.3754066363044893,
"grad_norm": 0.3112248182296753,
"learning_rate": 0.0001,
"loss": 2.2376,
"step": 577
},
{
"epoch": 0.3760572543916721,
"grad_norm": 0.21217738091945648,
"learning_rate": 0.0001,
"loss": 1.9146,
"step": 578
},
{
"epoch": 0.3767078724788549,
"grad_norm": 0.19359458982944489,
"learning_rate": 0.0001,
"loss": 2.0913,
"step": 579
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.27635738253593445,
"learning_rate": 0.0001,
"loss": 2.2855,
"step": 580
},
{
"epoch": 0.37800910865322057,
"grad_norm": 0.19366882741451263,
"learning_rate": 0.0001,
"loss": 2.0194,
"step": 581
},
{
"epoch": 0.3786597267404034,
"grad_norm": 0.2016839236021042,
"learning_rate": 0.0001,
"loss": 2.1519,
"step": 582
},
{
"epoch": 0.3793103448275862,
"grad_norm": 0.22154097259044647,
"learning_rate": 0.0001,
"loss": 1.9849,
"step": 583
},
{
"epoch": 0.37996096291476905,
"grad_norm": 0.2089187502861023,
"learning_rate": 0.0001,
"loss": 2.3624,
"step": 584
},
{
"epoch": 0.38061158100195186,
"grad_norm": 0.25050756335258484,
"learning_rate": 0.0001,
"loss": 2.1773,
"step": 585
},
{
"epoch": 0.38126219908913467,
"grad_norm": 0.23007918894290924,
"learning_rate": 0.0001,
"loss": 2.2054,
"step": 586
},
{
"epoch": 0.3819128171763175,
"grad_norm": 0.25022968649864197,
"learning_rate": 0.0001,
"loss": 2.219,
"step": 587
},
{
"epoch": 0.38256343526350034,
"grad_norm": 0.2205193042755127,
"learning_rate": 0.0001,
"loss": 2.2049,
"step": 588
},
{
"epoch": 0.38321405335068315,
"grad_norm": 0.21454961597919464,
"learning_rate": 0.0001,
"loss": 2.0683,
"step": 589
},
{
"epoch": 0.38386467143786596,
"grad_norm": 0.2088347226381302,
"learning_rate": 0.0001,
"loss": 2.1301,
"step": 590
},
{
"epoch": 0.3845152895250488,
"grad_norm": 0.20322394371032715,
"learning_rate": 0.0001,
"loss": 2.2098,
"step": 591
},
{
"epoch": 0.38516590761223163,
"grad_norm": 0.231514111161232,
"learning_rate": 0.0001,
"loss": 2.5523,
"step": 592
},
{
"epoch": 0.38581652569941444,
"grad_norm": 0.24791982769966125,
"learning_rate": 0.0001,
"loss": 2.2259,
"step": 593
},
{
"epoch": 0.38646714378659724,
"grad_norm": 0.21148578822612762,
"learning_rate": 0.0001,
"loss": 2.0834,
"step": 594
},
{
"epoch": 0.3871177618737801,
"grad_norm": 0.263713538646698,
"learning_rate": 0.0001,
"loss": 2.3101,
"step": 595
},
{
"epoch": 0.3877683799609629,
"grad_norm": 0.22197774052619934,
"learning_rate": 0.0001,
"loss": 2.1173,
"step": 596
},
{
"epoch": 0.3884189980481457,
"grad_norm": 0.2237439900636673,
"learning_rate": 0.0001,
"loss": 2.1109,
"step": 597
},
{
"epoch": 0.3890696161353286,
"grad_norm": 0.27451419830322266,
"learning_rate": 0.0001,
"loss": 2.5311,
"step": 598
},
{
"epoch": 0.3897202342225114,
"grad_norm": 0.18475750088691711,
"learning_rate": 0.0001,
"loss": 1.9241,
"step": 599
},
{
"epoch": 0.3903708523096942,
"grad_norm": 0.20120149850845337,
"learning_rate": 0.0001,
"loss": 2.1033,
"step": 600
},
{
"epoch": 0.391021470396877,
"grad_norm": 0.19626259803771973,
"learning_rate": 0.0001,
"loss": 2.1223,
"step": 601
},
{
"epoch": 0.3916720884840599,
"grad_norm": 0.22795897722244263,
"learning_rate": 0.0001,
"loss": 2.2021,
"step": 602
},
{
"epoch": 0.3923227065712427,
"grad_norm": 0.5195867419242859,
"learning_rate": 0.0001,
"loss": 3.1849,
"step": 603
},
{
"epoch": 0.3929733246584255,
"grad_norm": 0.2636241614818573,
"learning_rate": 0.0001,
"loss": 2.0739,
"step": 604
},
{
"epoch": 0.3936239427456083,
"grad_norm": 0.33922895789146423,
"learning_rate": 0.0001,
"loss": 2.31,
"step": 605
},
{
"epoch": 0.39427456083279117,
"grad_norm": 0.17467042803764343,
"learning_rate": 0.0001,
"loss": 1.9201,
"step": 606
},
{
"epoch": 0.394925178919974,
"grad_norm": 0.22457371652126312,
"learning_rate": 0.0001,
"loss": 1.9783,
"step": 607
},
{
"epoch": 0.3955757970071568,
"grad_norm": 0.5104444026947021,
"learning_rate": 0.0001,
"loss": 2.3777,
"step": 608
},
{
"epoch": 0.39622641509433965,
"grad_norm": 0.4531616270542145,
"learning_rate": 0.0001,
"loss": 2.8208,
"step": 609
},
{
"epoch": 0.39687703318152245,
"grad_norm": 0.20649151504039764,
"learning_rate": 0.0001,
"loss": 2.1377,
"step": 610
},
{
"epoch": 0.39752765126870526,
"grad_norm": 0.39769667387008667,
"learning_rate": 0.0001,
"loss": 2.2228,
"step": 611
},
{
"epoch": 0.39817826935588807,
"grad_norm": 0.2832731008529663,
"learning_rate": 0.0001,
"loss": 1.9664,
"step": 612
},
{
"epoch": 0.39882888744307093,
"grad_norm": 0.2754386067390442,
"learning_rate": 0.0001,
"loss": 2.5595,
"step": 613
},
{
"epoch": 0.39947950553025374,
"grad_norm": 0.404364675283432,
"learning_rate": 0.0001,
"loss": 2.8133,
"step": 614
},
{
"epoch": 0.40013012361743655,
"grad_norm": 0.30304789543151855,
"learning_rate": 0.0001,
"loss": 2.2729,
"step": 615
},
{
"epoch": 0.4007807417046194,
"grad_norm": 0.2519910931587219,
"learning_rate": 0.0001,
"loss": 2.3655,
"step": 616
},
{
"epoch": 0.4014313597918022,
"grad_norm": 0.2863995134830475,
"learning_rate": 0.0001,
"loss": 2.0774,
"step": 617
},
{
"epoch": 0.40208197787898503,
"grad_norm": 0.393622487783432,
"learning_rate": 0.0001,
"loss": 2.5082,
"step": 618
},
{
"epoch": 0.40273259596616784,
"grad_norm": 0.21836060285568237,
"learning_rate": 0.0001,
"loss": 1.9548,
"step": 619
},
{
"epoch": 0.4033832140533507,
"grad_norm": 0.358052521944046,
"learning_rate": 0.0001,
"loss": 2.5158,
"step": 620
},
{
"epoch": 0.4040338321405335,
"grad_norm": 0.237140953540802,
"learning_rate": 0.0001,
"loss": 2.2111,
"step": 621
},
{
"epoch": 0.4046844502277163,
"grad_norm": 0.20998883247375488,
"learning_rate": 0.0001,
"loss": 2.1351,
"step": 622
},
{
"epoch": 0.4053350683148991,
"grad_norm": 0.18059247732162476,
"learning_rate": 0.0001,
"loss": 1.9451,
"step": 623
},
{
"epoch": 0.405985686402082,
"grad_norm": 0.17532669007778168,
"learning_rate": 0.0001,
"loss": 1.8591,
"step": 624
},
{
"epoch": 0.4066363044892648,
"grad_norm": 0.24097976088523865,
"learning_rate": 0.0001,
"loss": 2.6534,
"step": 625
},
{
"epoch": 0.4072869225764476,
"grad_norm": 0.19505445659160614,
"learning_rate": 0.0001,
"loss": 1.8952,
"step": 626
},
{
"epoch": 0.40793754066363047,
"grad_norm": 0.232722207903862,
"learning_rate": 0.0001,
"loss": 2.2055,
"step": 627
},
{
"epoch": 0.4085881587508133,
"grad_norm": 0.23899732530117035,
"learning_rate": 0.0001,
"loss": 2.5848,
"step": 628
},
{
"epoch": 0.4092387768379961,
"grad_norm": 0.2411729097366333,
"learning_rate": 0.0001,
"loss": 2.5315,
"step": 629
},
{
"epoch": 0.4098893949251789,
"grad_norm": 0.25042012333869934,
"learning_rate": 0.0001,
"loss": 2.4154,
"step": 630
},
{
"epoch": 0.41054001301236176,
"grad_norm": 0.2764488160610199,
"learning_rate": 0.0001,
"loss": 2.0564,
"step": 631
},
{
"epoch": 0.41119063109954457,
"grad_norm": 0.24761155247688293,
"learning_rate": 0.0001,
"loss": 2.3245,
"step": 632
},
{
"epoch": 0.4118412491867274,
"grad_norm": 0.22376200556755066,
"learning_rate": 0.0001,
"loss": 2.1881,
"step": 633
},
{
"epoch": 0.41249186727391024,
"grad_norm": 0.19060148298740387,
"learning_rate": 0.0001,
"loss": 1.9588,
"step": 634
},
{
"epoch": 0.41314248536109305,
"grad_norm": 0.4157400131225586,
"learning_rate": 0.0001,
"loss": 2.9024,
"step": 635
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.2557002007961273,
"learning_rate": 0.0001,
"loss": 1.9819,
"step": 636
},
{
"epoch": 0.41444372153545866,
"grad_norm": 0.2908417880535126,
"learning_rate": 0.0001,
"loss": 2.112,
"step": 637
},
{
"epoch": 0.41509433962264153,
"grad_norm": 0.32937270402908325,
"learning_rate": 0.0001,
"loss": 2.4976,
"step": 638
},
{
"epoch": 0.41574495770982434,
"grad_norm": 0.20382268726825714,
"learning_rate": 0.0001,
"loss": 2.0448,
"step": 639
},
{
"epoch": 0.41639557579700714,
"grad_norm": 0.23484939336776733,
"learning_rate": 0.0001,
"loss": 1.9514,
"step": 640
},
{
"epoch": 0.41704619388418995,
"grad_norm": 0.23023058474063873,
"learning_rate": 0.0001,
"loss": 2.0768,
"step": 641
},
{
"epoch": 0.4176968119713728,
"grad_norm": 0.22951190173625946,
"learning_rate": 0.0001,
"loss": 2.0764,
"step": 642
},
{
"epoch": 0.4183474300585556,
"grad_norm": 0.18971513211727142,
"learning_rate": 0.0001,
"loss": 1.9693,
"step": 643
},
{
"epoch": 0.41899804814573843,
"grad_norm": 0.24955709278583527,
"learning_rate": 0.0001,
"loss": 2.4898,
"step": 644
},
{
"epoch": 0.4196486662329213,
"grad_norm": 0.3344306945800781,
"learning_rate": 0.0001,
"loss": 2.4779,
"step": 645
},
{
"epoch": 0.4202992843201041,
"grad_norm": 0.21661825478076935,
"learning_rate": 0.0001,
"loss": 2.0472,
"step": 646
},
{
"epoch": 0.4209499024072869,
"grad_norm": 0.1972419023513794,
"learning_rate": 0.0001,
"loss": 2.1712,
"step": 647
},
{
"epoch": 0.4216005204944697,
"grad_norm": 0.21619470417499542,
"learning_rate": 0.0001,
"loss": 2.0739,
"step": 648
},
{
"epoch": 0.4222511385816526,
"grad_norm": 0.2329091727733612,
"learning_rate": 0.0001,
"loss": 2.1362,
"step": 649
},
{
"epoch": 0.4229017566688354,
"grad_norm": 0.22971969842910767,
"learning_rate": 0.0001,
"loss": 1.9898,
"step": 650
},
{
"epoch": 0.4235523747560182,
"grad_norm": 0.20185063779354095,
"learning_rate": 0.0001,
"loss": 2.1008,
"step": 651
},
{
"epoch": 0.42420299284320107,
"grad_norm": 0.2658546566963196,
"learning_rate": 0.0001,
"loss": 2.5734,
"step": 652
},
{
"epoch": 0.4248536109303839,
"grad_norm": 0.23109374940395355,
"learning_rate": 0.0001,
"loss": 2.2569,
"step": 653
},
{
"epoch": 0.4255042290175667,
"grad_norm": 0.25115352869033813,
"learning_rate": 0.0001,
"loss": 2.5967,
"step": 654
},
{
"epoch": 0.4261548471047495,
"grad_norm": 0.20470669865608215,
"learning_rate": 0.0001,
"loss": 2.0302,
"step": 655
},
{
"epoch": 0.42680546519193235,
"grad_norm": 0.2151513546705246,
"learning_rate": 0.0001,
"loss": 2.5183,
"step": 656
},
{
"epoch": 0.42745608327911516,
"grad_norm": 0.2571411728858948,
"learning_rate": 0.0001,
"loss": 2.255,
"step": 657
},
{
"epoch": 0.42810670136629797,
"grad_norm": 0.2414022833108902,
"learning_rate": 0.0001,
"loss": 2.4076,
"step": 658
},
{
"epoch": 0.42875731945348083,
"grad_norm": 0.21041014790534973,
"learning_rate": 0.0001,
"loss": 2.0091,
"step": 659
},
{
"epoch": 0.42940793754066364,
"grad_norm": 0.21241822838783264,
"learning_rate": 0.0001,
"loss": 2.355,
"step": 660
},
{
"epoch": 0.43005855562784645,
"grad_norm": 0.21031403541564941,
"learning_rate": 0.0001,
"loss": 1.9887,
"step": 661
},
{
"epoch": 0.43070917371502926,
"grad_norm": 0.19765952229499817,
"learning_rate": 0.0001,
"loss": 2.1555,
"step": 662
},
{
"epoch": 0.4313597918022121,
"grad_norm": 0.24740834534168243,
"learning_rate": 0.0001,
"loss": 2.2349,
"step": 663
},
{
"epoch": 0.43201040988939493,
"grad_norm": 0.22086234390735626,
"learning_rate": 0.0001,
"loss": 2.0948,
"step": 664
},
{
"epoch": 0.43266102797657774,
"grad_norm": 0.21949239075183868,
"learning_rate": 0.0001,
"loss": 2.3905,
"step": 665
},
{
"epoch": 0.43331164606376055,
"grad_norm": 0.20536834001541138,
"learning_rate": 0.0001,
"loss": 2.0547,
"step": 666
},
{
"epoch": 0.4339622641509434,
"grad_norm": 0.2570655941963196,
"learning_rate": 0.0001,
"loss": 2.0261,
"step": 667
},
{
"epoch": 0.4346128822381262,
"grad_norm": 0.3293687701225281,
"learning_rate": 0.0001,
"loss": 2.344,
"step": 668
},
{
"epoch": 0.435263500325309,
"grad_norm": 0.22947120666503906,
"learning_rate": 0.0001,
"loss": 2.232,
"step": 669
},
{
"epoch": 0.4359141184124919,
"grad_norm": 0.2425599992275238,
"learning_rate": 0.0001,
"loss": 2.309,
"step": 670
},
{
"epoch": 0.4365647364996747,
"grad_norm": 0.2506352663040161,
"learning_rate": 0.0001,
"loss": 2.1249,
"step": 671
},
{
"epoch": 0.4372153545868575,
"grad_norm": 0.19457192718982697,
"learning_rate": 0.0001,
"loss": 1.9461,
"step": 672
},
{
"epoch": 0.4378659726740403,
"grad_norm": 0.3749271035194397,
"learning_rate": 0.0001,
"loss": 2.8532,
"step": 673
},
{
"epoch": 0.4385165907612232,
"grad_norm": 0.25384366512298584,
"learning_rate": 0.0001,
"loss": 2.6495,
"step": 674
},
{
"epoch": 0.439167208848406,
"grad_norm": 0.21413469314575195,
"learning_rate": 0.0001,
"loss": 2.084,
"step": 675
},
{
"epoch": 0.4398178269355888,
"grad_norm": 0.228125661611557,
"learning_rate": 0.0001,
"loss": 2.2175,
"step": 676
},
{
"epoch": 0.44046844502277166,
"grad_norm": 0.1948491632938385,
"learning_rate": 0.0001,
"loss": 1.9702,
"step": 677
},
{
"epoch": 0.44111906310995447,
"grad_norm": 0.307992547750473,
"learning_rate": 0.0001,
"loss": 2.5884,
"step": 678
},
{
"epoch": 0.4417696811971373,
"grad_norm": 0.23681728541851044,
"learning_rate": 0.0001,
"loss": 2.2104,
"step": 679
},
{
"epoch": 0.4424202992843201,
"grad_norm": 0.23185166716575623,
"learning_rate": 0.0001,
"loss": 2.0823,
"step": 680
},
{
"epoch": 0.44307091737150295,
"grad_norm": 0.2772667109966278,
"learning_rate": 0.0001,
"loss": 2.3729,
"step": 681
},
{
"epoch": 0.44372153545868576,
"grad_norm": 0.18908965587615967,
"learning_rate": 0.0001,
"loss": 2.0585,
"step": 682
},
{
"epoch": 0.44437215354586856,
"grad_norm": 0.2063988745212555,
"learning_rate": 0.0001,
"loss": 1.9474,
"step": 683
},
{
"epoch": 0.4450227716330514,
"grad_norm": 0.19444917142391205,
"learning_rate": 0.0001,
"loss": 1.9269,
"step": 684
},
{
"epoch": 0.44567338972023424,
"grad_norm": 0.2866727113723755,
"learning_rate": 0.0001,
"loss": 2.5145,
"step": 685
},
{
"epoch": 0.44632400780741704,
"grad_norm": 0.24801641702651978,
"learning_rate": 0.0001,
"loss": 2.2954,
"step": 686
},
{
"epoch": 0.44697462589459985,
"grad_norm": 0.2115658074617386,
"learning_rate": 0.0001,
"loss": 2.1956,
"step": 687
},
{
"epoch": 0.4476252439817827,
"grad_norm": 0.3155558109283447,
"learning_rate": 0.0001,
"loss": 2.7396,
"step": 688
},
{
"epoch": 0.4482758620689655,
"grad_norm": 0.22418133914470673,
"learning_rate": 0.0001,
"loss": 2.1066,
"step": 689
},
{
"epoch": 0.44892648015614833,
"grad_norm": 0.2707614600658417,
"learning_rate": 0.0001,
"loss": 2.3353,
"step": 690
},
{
"epoch": 0.44957709824333114,
"grad_norm": 0.22262880206108093,
"learning_rate": 0.0001,
"loss": 2.2143,
"step": 691
},
{
"epoch": 0.450227716330514,
"grad_norm": 0.25256767868995667,
"learning_rate": 0.0001,
"loss": 2.2786,
"step": 692
},
{
"epoch": 0.4508783344176968,
"grad_norm": 0.20360921323299408,
"learning_rate": 0.0001,
"loss": 2.0059,
"step": 693
},
{
"epoch": 0.4515289525048796,
"grad_norm": 0.20573420822620392,
"learning_rate": 0.0001,
"loss": 2.0884,
"step": 694
},
{
"epoch": 0.4521795705920625,
"grad_norm": 0.31812623143196106,
"learning_rate": 0.0001,
"loss": 2.5905,
"step": 695
},
{
"epoch": 0.4528301886792453,
"grad_norm": 0.24690969288349152,
"learning_rate": 0.0001,
"loss": 2.5157,
"step": 696
},
{
"epoch": 0.4534808067664281,
"grad_norm": 0.256793737411499,
"learning_rate": 0.0001,
"loss": 2.1548,
"step": 697
},
{
"epoch": 0.4541314248536109,
"grad_norm": 0.2659960985183716,
"learning_rate": 0.0001,
"loss": 2.2977,
"step": 698
},
{
"epoch": 0.4547820429407938,
"grad_norm": 0.23824195563793182,
"learning_rate": 0.0001,
"loss": 2.5946,
"step": 699
},
{
"epoch": 0.4554326610279766,
"grad_norm": 0.2580608129501343,
"learning_rate": 0.0001,
"loss": 2.2608,
"step": 700
},
{
"epoch": 0.4560832791151594,
"grad_norm": 0.270622193813324,
"learning_rate": 0.0001,
"loss": 2.5848,
"step": 701
},
{
"epoch": 0.4567338972023422,
"grad_norm": 0.2170489877462387,
"learning_rate": 0.0001,
"loss": 2.4315,
"step": 702
},
{
"epoch": 0.45738451528952506,
"grad_norm": 0.20716050267219543,
"learning_rate": 0.0001,
"loss": 2.1592,
"step": 703
},
{
"epoch": 0.45803513337670787,
"grad_norm": 0.24847671389579773,
"learning_rate": 0.0001,
"loss": 2.3202,
"step": 704
},
{
"epoch": 0.4586857514638907,
"grad_norm": 0.24049146473407745,
"learning_rate": 0.0001,
"loss": 2.1968,
"step": 705
},
{
"epoch": 0.45933636955107354,
"grad_norm": 0.2079533487558365,
"learning_rate": 0.0001,
"loss": 2.2966,
"step": 706
},
{
"epoch": 0.45998698763825635,
"grad_norm": 0.18255428969860077,
"learning_rate": 0.0001,
"loss": 1.9931,
"step": 707
},
{
"epoch": 0.46063760572543916,
"grad_norm": 0.28015655279159546,
"learning_rate": 0.0001,
"loss": 2.2605,
"step": 708
},
{
"epoch": 0.46128822381262197,
"grad_norm": 0.27453094720840454,
"learning_rate": 0.0001,
"loss": 2.2835,
"step": 709
},
{
"epoch": 0.46193884189980483,
"grad_norm": 0.2751506268978119,
"learning_rate": 0.0001,
"loss": 2.665,
"step": 710
},
{
"epoch": 0.46258945998698764,
"grad_norm": 0.2759210169315338,
"learning_rate": 0.0001,
"loss": 2.3593,
"step": 711
},
{
"epoch": 0.46324007807417045,
"grad_norm": 0.2902829051017761,
"learning_rate": 0.0001,
"loss": 2.7421,
"step": 712
},
{
"epoch": 0.4638906961613533,
"grad_norm": 0.24083854258060455,
"learning_rate": 0.0001,
"loss": 2.4644,
"step": 713
},
{
"epoch": 0.4645413142485361,
"grad_norm": 0.23614934086799622,
"learning_rate": 0.0001,
"loss": 2.2939,
"step": 714
},
{
"epoch": 0.4651919323357189,
"grad_norm": 0.1972537487745285,
"learning_rate": 0.0001,
"loss": 1.9391,
"step": 715
},
{
"epoch": 0.46584255042290174,
"grad_norm": 0.2227838933467865,
"learning_rate": 0.0001,
"loss": 1.9396,
"step": 716
},
{
"epoch": 0.4664931685100846,
"grad_norm": 0.3672918379306793,
"learning_rate": 0.0001,
"loss": 2.7508,
"step": 717
},
{
"epoch": 0.4671437865972674,
"grad_norm": 0.2712246775627136,
"learning_rate": 0.0001,
"loss": 2.2838,
"step": 718
},
{
"epoch": 0.4677944046844502,
"grad_norm": 0.2337927669286728,
"learning_rate": 0.0001,
"loss": 1.9807,
"step": 719
},
{
"epoch": 0.468445022771633,
"grad_norm": 0.2051180601119995,
"learning_rate": 0.0001,
"loss": 2.0311,
"step": 720
},
{
"epoch": 0.4690956408588159,
"grad_norm": 0.1965889185667038,
"learning_rate": 0.0001,
"loss": 2.1114,
"step": 721
},
{
"epoch": 0.4697462589459987,
"grad_norm": 0.2106337547302246,
"learning_rate": 0.0001,
"loss": 2.0792,
"step": 722
},
{
"epoch": 0.4703968770331815,
"grad_norm": 0.19918356835842133,
"learning_rate": 0.0001,
"loss": 2.1323,
"step": 723
},
{
"epoch": 0.47104749512036437,
"grad_norm": 0.20124401152133942,
"learning_rate": 0.0001,
"loss": 2.0008,
"step": 724
},
{
"epoch": 0.4716981132075472,
"grad_norm": 0.2172473967075348,
"learning_rate": 0.0001,
"loss": 2.3891,
"step": 725
},
{
"epoch": 0.47234873129473,
"grad_norm": 0.2524811029434204,
"learning_rate": 0.0001,
"loss": 2.3343,
"step": 726
},
{
"epoch": 0.4729993493819128,
"grad_norm": 0.22882957756519318,
"learning_rate": 0.0001,
"loss": 2.6723,
"step": 727
},
{
"epoch": 0.47364996746909566,
"grad_norm": 0.2434161901473999,
"learning_rate": 0.0001,
"loss": 1.9549,
"step": 728
},
{
"epoch": 0.47430058555627846,
"grad_norm": 0.19140364229679108,
"learning_rate": 0.0001,
"loss": 2.0468,
"step": 729
},
{
"epoch": 0.4749512036434613,
"grad_norm": 0.22166937589645386,
"learning_rate": 0.0001,
"loss": 2.3432,
"step": 730
},
{
"epoch": 0.47560182173064414,
"grad_norm": 0.2005748748779297,
"learning_rate": 0.0001,
"loss": 2.0616,
"step": 731
},
{
"epoch": 0.47625243981782694,
"grad_norm": 0.3115980923175812,
"learning_rate": 0.0001,
"loss": 2.6153,
"step": 732
},
{
"epoch": 0.47690305790500975,
"grad_norm": 0.27135169506073,
"learning_rate": 0.0001,
"loss": 2.3225,
"step": 733
},
{
"epoch": 0.47755367599219256,
"grad_norm": 0.20748727023601532,
"learning_rate": 0.0001,
"loss": 1.834,
"step": 734
},
{
"epoch": 0.4782042940793754,
"grad_norm": 0.4031495153903961,
"learning_rate": 0.0001,
"loss": 2.8177,
"step": 735
},
{
"epoch": 0.47885491216655823,
"grad_norm": 0.2978368401527405,
"learning_rate": 0.0001,
"loss": 2.6178,
"step": 736
},
{
"epoch": 0.47950553025374104,
"grad_norm": 0.3466270864009857,
"learning_rate": 0.0001,
"loss": 2.6031,
"step": 737
},
{
"epoch": 0.4801561483409239,
"grad_norm": 0.20074127614498138,
"learning_rate": 0.0001,
"loss": 2.247,
"step": 738
},
{
"epoch": 0.4808067664281067,
"grad_norm": 0.2393479198217392,
"learning_rate": 0.0001,
"loss": 2.1265,
"step": 739
},
{
"epoch": 0.4814573845152895,
"grad_norm": 0.27758634090423584,
"learning_rate": 0.0001,
"loss": 2.5025,
"step": 740
},
{
"epoch": 0.48210800260247233,
"grad_norm": 0.20123820006847382,
"learning_rate": 0.0001,
"loss": 2.0083,
"step": 741
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.19012506306171417,
"learning_rate": 0.0001,
"loss": 2.0212,
"step": 742
},
{
"epoch": 0.483409238776838,
"grad_norm": 0.19451047480106354,
"learning_rate": 0.0001,
"loss": 2.0295,
"step": 743
},
{
"epoch": 0.4840598568640208,
"grad_norm": 0.3339052200317383,
"learning_rate": 0.0001,
"loss": 2.4813,
"step": 744
},
{
"epoch": 0.4847104749512036,
"grad_norm": 0.2646152973175049,
"learning_rate": 0.0001,
"loss": 2.4302,
"step": 745
},
{
"epoch": 0.4853610930383865,
"grad_norm": 0.23590324819087982,
"learning_rate": 0.0001,
"loss": 2.1723,
"step": 746
},
{
"epoch": 0.4860117111255693,
"grad_norm": 0.28924039006233215,
"learning_rate": 0.0001,
"loss": 2.8005,
"step": 747
},
{
"epoch": 0.4866623292127521,
"grad_norm": 0.21145464479923248,
"learning_rate": 0.0001,
"loss": 2.3501,
"step": 748
},
{
"epoch": 0.48731294729993496,
"grad_norm": 0.22815656661987305,
"learning_rate": 0.0001,
"loss": 2.1997,
"step": 749
},
{
"epoch": 0.48796356538711777,
"grad_norm": 0.24325215816497803,
"learning_rate": 0.0001,
"loss": 2.039,
"step": 750
},
{
"epoch": 0.4886141834743006,
"grad_norm": 0.3235335052013397,
"learning_rate": 0.0001,
"loss": 2.4533,
"step": 751
},
{
"epoch": 0.4892648015614834,
"grad_norm": 0.25513559579849243,
"learning_rate": 0.0001,
"loss": 2.3779,
"step": 752
},
{
"epoch": 0.48991541964866625,
"grad_norm": 0.2905427813529968,
"learning_rate": 0.0001,
"loss": 1.9843,
"step": 753
},
{
"epoch": 0.49056603773584906,
"grad_norm": 0.23760183155536652,
"learning_rate": 0.0001,
"loss": 2.1825,
"step": 754
},
{
"epoch": 0.49121665582303187,
"grad_norm": 0.2170071303844452,
"learning_rate": 0.0001,
"loss": 1.9877,
"step": 755
},
{
"epoch": 0.49186727391021473,
"grad_norm": 0.2555190920829773,
"learning_rate": 0.0001,
"loss": 2.457,
"step": 756
},
{
"epoch": 0.49251789199739754,
"grad_norm": 0.2571033835411072,
"learning_rate": 0.0001,
"loss": 2.1152,
"step": 757
},
{
"epoch": 0.49316851008458035,
"grad_norm": 0.23969238996505737,
"learning_rate": 0.0001,
"loss": 2.3439,
"step": 758
},
{
"epoch": 0.49381912817176316,
"grad_norm": 0.1900262087583542,
"learning_rate": 0.0001,
"loss": 1.8999,
"step": 759
},
{
"epoch": 0.494469746258946,
"grad_norm": 0.19621430337429047,
"learning_rate": 0.0001,
"loss": 2.0658,
"step": 760
},
{
"epoch": 0.4951203643461288,
"grad_norm": 0.21956481039524078,
"learning_rate": 0.0001,
"loss": 2.5427,
"step": 761
},
{
"epoch": 0.49577098243331164,
"grad_norm": 0.22567258775234222,
"learning_rate": 0.0001,
"loss": 2.2777,
"step": 762
},
{
"epoch": 0.49642160052049444,
"grad_norm": 0.20233570039272308,
"learning_rate": 0.0001,
"loss": 2.0342,
"step": 763
},
{
"epoch": 0.4970722186076773,
"grad_norm": 0.23662947118282318,
"learning_rate": 0.0001,
"loss": 2.3668,
"step": 764
},
{
"epoch": 0.4977228366948601,
"grad_norm": 0.2625278830528259,
"learning_rate": 0.0001,
"loss": 2.6536,
"step": 765
},
{
"epoch": 0.4983734547820429,
"grad_norm": 0.23235228657722473,
"learning_rate": 0.0001,
"loss": 2.1891,
"step": 766
},
{
"epoch": 0.4990240728692258,
"grad_norm": 0.19439217448234558,
"learning_rate": 0.0001,
"loss": 1.9647,
"step": 767
},
{
"epoch": 0.4996746909564086,
"grad_norm": 0.19810114800930023,
"learning_rate": 0.0001,
"loss": 1.9965,
"step": 768
},
{
"epoch": 0.5003253090435914,
"grad_norm": 0.2525380253791809,
"learning_rate": 0.0001,
"loss": 2.2444,
"step": 769
},
{
"epoch": 0.5009759271307742,
"grad_norm": 0.2409314513206482,
"learning_rate": 0.0001,
"loss": 2.1717,
"step": 770
},
{
"epoch": 0.501626545217957,
"grad_norm": 0.25244686007499695,
"learning_rate": 0.0001,
"loss": 2.0126,
"step": 771
},
{
"epoch": 0.5022771633051398,
"grad_norm": 0.19767141342163086,
"learning_rate": 0.0001,
"loss": 2.1384,
"step": 772
},
{
"epoch": 0.5029277813923227,
"grad_norm": 0.39446812868118286,
"learning_rate": 0.0001,
"loss": 2.8039,
"step": 773
},
{
"epoch": 0.5035783994795056,
"grad_norm": 0.2643390893936157,
"learning_rate": 0.0001,
"loss": 2.1524,
"step": 774
},
{
"epoch": 0.5042290175666884,
"grad_norm": 0.27606508135795593,
"learning_rate": 0.0001,
"loss": 2.1802,
"step": 775
},
{
"epoch": 0.5048796356538712,
"grad_norm": 0.364106148481369,
"learning_rate": 0.0001,
"loss": 2.9694,
"step": 776
},
{
"epoch": 0.505530253741054,
"grad_norm": 0.23091645538806915,
"learning_rate": 0.0001,
"loss": 2.5471,
"step": 777
},
{
"epoch": 0.5061808718282368,
"grad_norm": 0.19318193197250366,
"learning_rate": 0.0001,
"loss": 2.2082,
"step": 778
},
{
"epoch": 0.5068314899154196,
"grad_norm": 0.28997862339019775,
"learning_rate": 0.0001,
"loss": 2.4399,
"step": 779
},
{
"epoch": 0.5074821080026025,
"grad_norm": 0.22487197816371918,
"learning_rate": 0.0001,
"loss": 2.1946,
"step": 780
},
{
"epoch": 0.5081327260897853,
"grad_norm": 0.24430596828460693,
"learning_rate": 0.0001,
"loss": 2.4456,
"step": 781
},
{
"epoch": 0.5087833441769681,
"grad_norm": 0.21677151322364807,
"learning_rate": 0.0001,
"loss": 2.2082,
"step": 782
},
{
"epoch": 0.5094339622641509,
"grad_norm": 0.47995632886886597,
"learning_rate": 0.0001,
"loss": 3.1358,
"step": 783
},
{
"epoch": 0.5100845803513337,
"grad_norm": 0.19044414162635803,
"learning_rate": 0.0001,
"loss": 1.8924,
"step": 784
},
{
"epoch": 0.5107351984385166,
"grad_norm": 0.19143608212471008,
"learning_rate": 0.0001,
"loss": 2.0459,
"step": 785
},
{
"epoch": 0.5113858165256994,
"grad_norm": 0.22588413953781128,
"learning_rate": 0.0001,
"loss": 2.1369,
"step": 786
},
{
"epoch": 0.5120364346128823,
"grad_norm": 0.2786167860031128,
"learning_rate": 0.0001,
"loss": 2.2029,
"step": 787
},
{
"epoch": 0.5126870527000651,
"grad_norm": 0.24471627175807953,
"learning_rate": 0.0001,
"loss": 2.1248,
"step": 788
},
{
"epoch": 0.5133376707872479,
"grad_norm": 0.17795225977897644,
"learning_rate": 0.0001,
"loss": 1.7926,
"step": 789
},
{
"epoch": 0.5139882888744307,
"grad_norm": 0.2173709124326706,
"learning_rate": 0.0001,
"loss": 2.0538,
"step": 790
},
{
"epoch": 0.5146389069616135,
"grad_norm": 0.2027692049741745,
"learning_rate": 0.0001,
"loss": 1.8568,
"step": 791
},
{
"epoch": 0.5152895250487963,
"grad_norm": 0.2013595849275589,
"learning_rate": 0.0001,
"loss": 2.0501,
"step": 792
},
{
"epoch": 0.5159401431359791,
"grad_norm": 0.21996662020683289,
"learning_rate": 0.0001,
"loss": 2.0374,
"step": 793
},
{
"epoch": 0.516590761223162,
"grad_norm": 0.21435722708702087,
"learning_rate": 0.0001,
"loss": 2.1907,
"step": 794
},
{
"epoch": 0.5172413793103449,
"grad_norm": 0.21512284874916077,
"learning_rate": 0.0001,
"loss": 2.315,
"step": 795
},
{
"epoch": 0.5178919973975277,
"grad_norm": 0.19432400166988373,
"learning_rate": 0.0001,
"loss": 2.103,
"step": 796
},
{
"epoch": 0.5185426154847105,
"grad_norm": 0.23112992942333221,
"learning_rate": 0.0001,
"loss": 2.328,
"step": 797
},
{
"epoch": 0.5191932335718933,
"grad_norm": 0.19719737768173218,
"learning_rate": 0.0001,
"loss": 1.9569,
"step": 798
},
{
"epoch": 0.5198438516590761,
"grad_norm": 0.2115892618894577,
"learning_rate": 0.0001,
"loss": 2.2533,
"step": 799
},
{
"epoch": 0.5204944697462589,
"grad_norm": 0.24321842193603516,
"learning_rate": 0.0001,
"loss": 2.6597,
"step": 800
},
{
"epoch": 0.5211450878334418,
"grad_norm": 0.18219350278377533,
"learning_rate": 0.0001,
"loss": 1.8709,
"step": 801
},
{
"epoch": 0.5217957059206246,
"grad_norm": 0.18715021014213562,
"learning_rate": 0.0001,
"loss": 2.0021,
"step": 802
},
{
"epoch": 0.5224463240078074,
"grad_norm": 0.25940024852752686,
"learning_rate": 0.0001,
"loss": 2.3742,
"step": 803
},
{
"epoch": 0.5230969420949902,
"grad_norm": 0.18714728951454163,
"learning_rate": 0.0001,
"loss": 2.211,
"step": 804
},
{
"epoch": 0.523747560182173,
"grad_norm": 0.20145951211452484,
"learning_rate": 0.0001,
"loss": 2.0047,
"step": 805
},
{
"epoch": 0.5243981782693559,
"grad_norm": 0.18992845714092255,
"learning_rate": 0.0001,
"loss": 1.8559,
"step": 806
},
{
"epoch": 0.5250487963565387,
"grad_norm": 0.2682324945926666,
"learning_rate": 0.0001,
"loss": 2.4791,
"step": 807
},
{
"epoch": 0.5256994144437215,
"grad_norm": 0.33034664392471313,
"learning_rate": 0.0001,
"loss": 2.3089,
"step": 808
},
{
"epoch": 0.5263500325309044,
"grad_norm": 0.18838956952095032,
"learning_rate": 0.0001,
"loss": 1.9462,
"step": 809
},
{
"epoch": 0.5270006506180872,
"grad_norm": 0.42872169613838196,
"learning_rate": 0.0001,
"loss": 2.6874,
"step": 810
},
{
"epoch": 0.52765126870527,
"grad_norm": 0.2108643501996994,
"learning_rate": 0.0001,
"loss": 2.3627,
"step": 811
},
{
"epoch": 0.5283018867924528,
"grad_norm": 0.21745599806308746,
"learning_rate": 0.0001,
"loss": 2.1204,
"step": 812
},
{
"epoch": 0.5289525048796356,
"grad_norm": 0.2577585279941559,
"learning_rate": 0.0001,
"loss": 1.9746,
"step": 813
},
{
"epoch": 0.5296031229668184,
"grad_norm": 0.372471421957016,
"learning_rate": 0.0001,
"loss": 2.688,
"step": 814
},
{
"epoch": 0.5302537410540012,
"grad_norm": 0.2425181120634079,
"learning_rate": 0.0001,
"loss": 2.1377,
"step": 815
},
{
"epoch": 0.5309043591411842,
"grad_norm": 0.2638307511806488,
"learning_rate": 0.0001,
"loss": 2.1088,
"step": 816
},
{
"epoch": 0.531554977228367,
"grad_norm": 0.2356933355331421,
"learning_rate": 0.0001,
"loss": 2.2291,
"step": 817
},
{
"epoch": 0.5322055953155498,
"grad_norm": 0.23714864253997803,
"learning_rate": 0.0001,
"loss": 2.0929,
"step": 818
},
{
"epoch": 0.5328562134027326,
"grad_norm": 0.19541950523853302,
"learning_rate": 0.0001,
"loss": 2.0883,
"step": 819
},
{
"epoch": 0.5335068314899154,
"grad_norm": 0.3091617822647095,
"learning_rate": 0.0001,
"loss": 3.0127,
"step": 820
},
{
"epoch": 0.5341574495770982,
"grad_norm": 0.2592740058898926,
"learning_rate": 0.0001,
"loss": 1.8307,
"step": 821
},
{
"epoch": 0.534808067664281,
"grad_norm": 0.22505807876586914,
"learning_rate": 0.0001,
"loss": 2.462,
"step": 822
},
{
"epoch": 0.5354586857514639,
"grad_norm": 0.22032824158668518,
"learning_rate": 0.0001,
"loss": 2.2718,
"step": 823
},
{
"epoch": 0.5361093038386467,
"grad_norm": 0.2457459270954132,
"learning_rate": 0.0001,
"loss": 2.4213,
"step": 824
},
{
"epoch": 0.5367599219258296,
"grad_norm": 0.24181683361530304,
"learning_rate": 0.0001,
"loss": 1.9347,
"step": 825
},
{
"epoch": 0.5374105400130124,
"grad_norm": 0.29988738894462585,
"learning_rate": 0.0001,
"loss": 2.7697,
"step": 826
},
{
"epoch": 0.5380611581001952,
"grad_norm": 0.24946388602256775,
"learning_rate": 0.0001,
"loss": 2.2117,
"step": 827
},
{
"epoch": 0.538711776187378,
"grad_norm": 0.20339331030845642,
"learning_rate": 0.0001,
"loss": 1.9936,
"step": 828
},
{
"epoch": 0.5393623942745608,
"grad_norm": 0.22250457108020782,
"learning_rate": 0.0001,
"loss": 2.0785,
"step": 829
},
{
"epoch": 0.5400130123617437,
"grad_norm": 0.1869298666715622,
"learning_rate": 0.0001,
"loss": 2.0406,
"step": 830
},
{
"epoch": 0.5406636304489265,
"grad_norm": 0.1873755156993866,
"learning_rate": 0.0001,
"loss": 1.9126,
"step": 831
},
{
"epoch": 0.5413142485361093,
"grad_norm": 0.3135535418987274,
"learning_rate": 0.0001,
"loss": 2.2881,
"step": 832
},
{
"epoch": 0.5419648666232921,
"grad_norm": 0.20596185326576233,
"learning_rate": 0.0001,
"loss": 2.0682,
"step": 833
},
{
"epoch": 0.5426154847104749,
"grad_norm": 0.25786712765693665,
"learning_rate": 0.0001,
"loss": 2.0591,
"step": 834
},
{
"epoch": 0.5432661027976577,
"grad_norm": 0.2592066824436188,
"learning_rate": 0.0001,
"loss": 2.052,
"step": 835
},
{
"epoch": 0.5439167208848406,
"grad_norm": 0.20738951861858368,
"learning_rate": 0.0001,
"loss": 1.9726,
"step": 836
},
{
"epoch": 0.5445673389720235,
"grad_norm": 0.21384763717651367,
"learning_rate": 0.0001,
"loss": 2.1897,
"step": 837
},
{
"epoch": 0.5452179570592063,
"grad_norm": 0.22050943970680237,
"learning_rate": 0.0001,
"loss": 2.3597,
"step": 838
},
{
"epoch": 0.5458685751463891,
"grad_norm": 0.1996280699968338,
"learning_rate": 0.0001,
"loss": 2.0492,
"step": 839
},
{
"epoch": 0.5465191932335719,
"grad_norm": 0.2430533468723297,
"learning_rate": 0.0001,
"loss": 2.2774,
"step": 840
},
{
"epoch": 0.5471698113207547,
"grad_norm": 0.22777177393436432,
"learning_rate": 0.0001,
"loss": 2.0779,
"step": 841
},
{
"epoch": 0.5478204294079375,
"grad_norm": 0.22464539110660553,
"learning_rate": 0.0001,
"loss": 2.3316,
"step": 842
},
{
"epoch": 0.5484710474951203,
"grad_norm": 0.17759400606155396,
"learning_rate": 0.0001,
"loss": 1.8407,
"step": 843
},
{
"epoch": 0.5491216655823032,
"grad_norm": 0.22264355421066284,
"learning_rate": 0.0001,
"loss": 2.2869,
"step": 844
},
{
"epoch": 0.549772283669486,
"grad_norm": 0.20819737017154694,
"learning_rate": 0.0001,
"loss": 2.1209,
"step": 845
},
{
"epoch": 0.5504229017566689,
"grad_norm": 0.2194463461637497,
"learning_rate": 0.0001,
"loss": 2.1457,
"step": 846
},
{
"epoch": 0.5510735198438517,
"grad_norm": 0.19314661622047424,
"learning_rate": 0.0001,
"loss": 2.1063,
"step": 847
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.186354860663414,
"learning_rate": 0.0001,
"loss": 2.0833,
"step": 848
},
{
"epoch": 0.5523747560182173,
"grad_norm": 0.1862732619047165,
"learning_rate": 0.0001,
"loss": 1.9441,
"step": 849
},
{
"epoch": 0.5530253741054001,
"grad_norm": 0.24664181470870972,
"learning_rate": 0.0001,
"loss": 2.3277,
"step": 850
},
{
"epoch": 0.5536759921925829,
"grad_norm": 0.20182165503501892,
"learning_rate": 0.0001,
"loss": 2.1902,
"step": 851
},
{
"epoch": 0.5543266102797658,
"grad_norm": 0.2108999788761139,
"learning_rate": 0.0001,
"loss": 2.0826,
"step": 852
},
{
"epoch": 0.5549772283669486,
"grad_norm": 0.25388890504837036,
"learning_rate": 0.0001,
"loss": 2.5149,
"step": 853
},
{
"epoch": 0.5556278464541314,
"grad_norm": 0.2074718177318573,
"learning_rate": 0.0001,
"loss": 1.9135,
"step": 854
},
{
"epoch": 0.5562784645413142,
"grad_norm": 0.1992723047733307,
"learning_rate": 0.0001,
"loss": 2.186,
"step": 855
},
{
"epoch": 0.556929082628497,
"grad_norm": 0.18721085786819458,
"learning_rate": 0.0001,
"loss": 1.9453,
"step": 856
},
{
"epoch": 0.5575797007156799,
"grad_norm": 0.21606992185115814,
"learning_rate": 0.0001,
"loss": 2.1703,
"step": 857
},
{
"epoch": 0.5582303188028627,
"grad_norm": 0.2854723334312439,
"learning_rate": 0.0001,
"loss": 2.9538,
"step": 858
},
{
"epoch": 0.5588809368900456,
"grad_norm": 0.21503040194511414,
"learning_rate": 0.0001,
"loss": 2.0194,
"step": 859
},
{
"epoch": 0.5595315549772284,
"grad_norm": 0.2690679430961609,
"learning_rate": 0.0001,
"loss": 2.1562,
"step": 860
},
{
"epoch": 0.5601821730644112,
"grad_norm": 0.2811613976955414,
"learning_rate": 0.0001,
"loss": 2.2475,
"step": 861
},
{
"epoch": 0.560832791151594,
"grad_norm": 0.2551681697368622,
"learning_rate": 0.0001,
"loss": 2.5585,
"step": 862
},
{
"epoch": 0.5614834092387768,
"grad_norm": 0.21423856914043427,
"learning_rate": 0.0001,
"loss": 2.1194,
"step": 863
},
{
"epoch": 0.5621340273259596,
"grad_norm": 0.22121264040470123,
"learning_rate": 0.0001,
"loss": 1.9257,
"step": 864
},
{
"epoch": 0.5627846454131424,
"grad_norm": 0.38684332370758057,
"learning_rate": 0.0001,
"loss": 2.5203,
"step": 865
},
{
"epoch": 0.5634352635003254,
"grad_norm": 0.20299634337425232,
"learning_rate": 0.0001,
"loss": 2.0868,
"step": 866
},
{
"epoch": 0.5640858815875082,
"grad_norm": 0.33485493063926697,
"learning_rate": 0.0001,
"loss": 2.457,
"step": 867
},
{
"epoch": 0.564736499674691,
"grad_norm": 0.23778866231441498,
"learning_rate": 0.0001,
"loss": 1.9863,
"step": 868
},
{
"epoch": 0.5653871177618738,
"grad_norm": 0.18562458455562592,
"learning_rate": 0.0001,
"loss": 1.915,
"step": 869
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.3780176341533661,
"learning_rate": 0.0001,
"loss": 2.5518,
"step": 870
},
{
"epoch": 0.5666883539362394,
"grad_norm": 0.1924014538526535,
"learning_rate": 0.0001,
"loss": 2.0665,
"step": 871
},
{
"epoch": 0.5673389720234222,
"grad_norm": 0.19788160920143127,
"learning_rate": 0.0001,
"loss": 1.9408,
"step": 872
},
{
"epoch": 0.5679895901106051,
"grad_norm": 0.2435147911310196,
"learning_rate": 0.0001,
"loss": 2.3716,
"step": 873
},
{
"epoch": 0.5686402081977879,
"grad_norm": 0.2023211270570755,
"learning_rate": 0.0001,
"loss": 2.2786,
"step": 874
},
{
"epoch": 0.5692908262849707,
"grad_norm": 0.29936715960502625,
"learning_rate": 0.0001,
"loss": 2.6689,
"step": 875
},
{
"epoch": 0.5699414443721535,
"grad_norm": 0.18846483528614044,
"learning_rate": 0.0001,
"loss": 1.9436,
"step": 876
},
{
"epoch": 0.5705920624593364,
"grad_norm": 0.44592785835266113,
"learning_rate": 0.0001,
"loss": 2.8648,
"step": 877
},
{
"epoch": 0.5712426805465192,
"grad_norm": 0.221640944480896,
"learning_rate": 0.0001,
"loss": 2.1613,
"step": 878
},
{
"epoch": 0.571893298633702,
"grad_norm": 0.22345726191997528,
"learning_rate": 0.0001,
"loss": 2.076,
"step": 879
},
{
"epoch": 0.5725439167208849,
"grad_norm": 0.20094214379787445,
"learning_rate": 0.0001,
"loss": 2.0474,
"step": 880
},
{
"epoch": 0.5731945348080677,
"grad_norm": 0.1997043937444687,
"learning_rate": 0.0001,
"loss": 1.9812,
"step": 881
},
{
"epoch": 0.5738451528952505,
"grad_norm": 0.3758605420589447,
"learning_rate": 0.0001,
"loss": 2.8357,
"step": 882
},
{
"epoch": 0.5744957709824333,
"grad_norm": 0.2940578758716583,
"learning_rate": 0.0001,
"loss": 2.4955,
"step": 883
},
{
"epoch": 0.5751463890696161,
"grad_norm": 0.2434762865304947,
"learning_rate": 0.0001,
"loss": 2.0011,
"step": 884
},
{
"epoch": 0.5757970071567989,
"grad_norm": 0.24335308372974396,
"learning_rate": 0.0001,
"loss": 2.5458,
"step": 885
},
{
"epoch": 0.5764476252439817,
"grad_norm": 0.2063351422548294,
"learning_rate": 0.0001,
"loss": 1.9801,
"step": 886
},
{
"epoch": 0.5770982433311646,
"grad_norm": 0.35102301836013794,
"learning_rate": 0.0001,
"loss": 2.5647,
"step": 887
},
{
"epoch": 0.5777488614183475,
"grad_norm": 0.22332875430583954,
"learning_rate": 0.0001,
"loss": 2.0542,
"step": 888
},
{
"epoch": 0.5783994795055303,
"grad_norm": 0.2073124796152115,
"learning_rate": 0.0001,
"loss": 1.9348,
"step": 889
},
{
"epoch": 0.5790500975927131,
"grad_norm": 0.21079733967781067,
"learning_rate": 0.0001,
"loss": 1.9829,
"step": 890
},
{
"epoch": 0.5797007156798959,
"grad_norm": 0.2842913866043091,
"learning_rate": 0.0001,
"loss": 2.7215,
"step": 891
},
{
"epoch": 0.5803513337670787,
"grad_norm": 0.2807595133781433,
"learning_rate": 0.0001,
"loss": 2.1827,
"step": 892
},
{
"epoch": 0.5810019518542615,
"grad_norm": 0.24955599009990692,
"learning_rate": 0.0001,
"loss": 2.6246,
"step": 893
},
{
"epoch": 0.5816525699414443,
"grad_norm": 0.23281241953372955,
"learning_rate": 0.0001,
"loss": 2.3944,
"step": 894
},
{
"epoch": 0.5823031880286272,
"grad_norm": 0.2617682218551636,
"learning_rate": 0.0001,
"loss": 2.6147,
"step": 895
},
{
"epoch": 0.58295380611581,
"grad_norm": 0.1915360391139984,
"learning_rate": 0.0001,
"loss": 2.0095,
"step": 896
},
{
"epoch": 0.5836044242029929,
"grad_norm": 0.20270249247550964,
"learning_rate": 0.0001,
"loss": 1.8983,
"step": 897
},
{
"epoch": 0.5842550422901757,
"grad_norm": 0.21804624795913696,
"learning_rate": 0.0001,
"loss": 2.0425,
"step": 898
},
{
"epoch": 0.5849056603773585,
"grad_norm": 0.25326576828956604,
"learning_rate": 0.0001,
"loss": 2.4875,
"step": 899
},
{
"epoch": 0.5855562784645413,
"grad_norm": 0.21714434027671814,
"learning_rate": 0.0001,
"loss": 2.269,
"step": 900
},
{
"epoch": 0.5862068965517241,
"grad_norm": 0.22771766781806946,
"learning_rate": 0.0001,
"loss": 2.3039,
"step": 901
},
{
"epoch": 0.586857514638907,
"grad_norm": 0.3638748824596405,
"learning_rate": 0.0001,
"loss": 2.7448,
"step": 902
},
{
"epoch": 0.5875081327260898,
"grad_norm": 0.20194686949253082,
"learning_rate": 0.0001,
"loss": 2.0141,
"step": 903
},
{
"epoch": 0.5881587508132726,
"grad_norm": 0.187494158744812,
"learning_rate": 0.0001,
"loss": 2.1188,
"step": 904
},
{
"epoch": 0.5888093689004554,
"grad_norm": 0.23371635377407074,
"learning_rate": 0.0001,
"loss": 2.6014,
"step": 905
},
{
"epoch": 0.5894599869876382,
"grad_norm": 0.2642146050930023,
"learning_rate": 0.0001,
"loss": 2.2053,
"step": 906
},
{
"epoch": 0.590110605074821,
"grad_norm": 0.20045514404773712,
"learning_rate": 0.0001,
"loss": 2.1828,
"step": 907
},
{
"epoch": 0.5907612231620039,
"grad_norm": 0.22904321551322937,
"learning_rate": 0.0001,
"loss": 2.3128,
"step": 908
},
{
"epoch": 0.5914118412491868,
"grad_norm": 0.36857542395591736,
"learning_rate": 0.0001,
"loss": 3.3891,
"step": 909
},
{
"epoch": 0.5920624593363696,
"grad_norm": 0.3417764902114868,
"learning_rate": 0.0001,
"loss": 2.6737,
"step": 910
},
{
"epoch": 0.5927130774235524,
"grad_norm": 0.46861669421195984,
"learning_rate": 0.0001,
"loss": 2.5329,
"step": 911
},
{
"epoch": 0.5933636955107352,
"grad_norm": 0.32909440994262695,
"learning_rate": 0.0001,
"loss": 2.4894,
"step": 912
},
{
"epoch": 0.594014313597918,
"grad_norm": 0.2176060974597931,
"learning_rate": 0.0001,
"loss": 1.9696,
"step": 913
},
{
"epoch": 0.5946649316851008,
"grad_norm": 0.27317941188812256,
"learning_rate": 0.0001,
"loss": 2.2179,
"step": 914
},
{
"epoch": 0.5953155497722836,
"grad_norm": 0.267123281955719,
"learning_rate": 0.0001,
"loss": 2.5464,
"step": 915
},
{
"epoch": 0.5959661678594665,
"grad_norm": 0.320402055978775,
"learning_rate": 0.0001,
"loss": 2.5021,
"step": 916
},
{
"epoch": 0.5966167859466494,
"grad_norm": 0.20610998570919037,
"learning_rate": 0.0001,
"loss": 2.0586,
"step": 917
},
{
"epoch": 0.5972674040338322,
"grad_norm": 0.2108345478773117,
"learning_rate": 0.0001,
"loss": 2.3278,
"step": 918
},
{
"epoch": 0.597918022121015,
"grad_norm": 0.18368126451969147,
"learning_rate": 0.0001,
"loss": 2.1026,
"step": 919
},
{
"epoch": 0.5985686402081978,
"grad_norm": 0.20730890333652496,
"learning_rate": 0.0001,
"loss": 2.1936,
"step": 920
},
{
"epoch": 0.5992192582953806,
"grad_norm": 0.2921161651611328,
"learning_rate": 0.0001,
"loss": 2.5618,
"step": 921
},
{
"epoch": 0.5998698763825634,
"grad_norm": 0.23977220058441162,
"learning_rate": 0.0001,
"loss": 2.533,
"step": 922
},
{
"epoch": 0.6005204944697463,
"grad_norm": 0.25839105248451233,
"learning_rate": 0.0001,
"loss": 2.7033,
"step": 923
},
{
"epoch": 0.6011711125569291,
"grad_norm": 0.214335098862648,
"learning_rate": 0.0001,
"loss": 1.9153,
"step": 924
},
{
"epoch": 0.6018217306441119,
"grad_norm": 0.19577006995677948,
"learning_rate": 0.0001,
"loss": 1.8612,
"step": 925
},
{
"epoch": 0.6024723487312947,
"grad_norm": 0.22480078041553497,
"learning_rate": 0.0001,
"loss": 2.2383,
"step": 926
},
{
"epoch": 0.6031229668184775,
"grad_norm": 0.2090427577495575,
"learning_rate": 0.0001,
"loss": 1.9532,
"step": 927
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.21045666933059692,
"learning_rate": 0.0001,
"loss": 2.1285,
"step": 928
},
{
"epoch": 0.6044242029928432,
"grad_norm": 0.2302238792181015,
"learning_rate": 0.0001,
"loss": 2.5368,
"step": 929
},
{
"epoch": 0.605074821080026,
"grad_norm": 0.22230245172977448,
"learning_rate": 0.0001,
"loss": 2.0551,
"step": 930
},
{
"epoch": 0.6057254391672089,
"grad_norm": 0.2619292140007019,
"learning_rate": 0.0001,
"loss": 2.5149,
"step": 931
},
{
"epoch": 0.6063760572543917,
"grad_norm": 0.20247308909893036,
"learning_rate": 0.0001,
"loss": 2.0032,
"step": 932
},
{
"epoch": 0.6070266753415745,
"grad_norm": 0.19772449135780334,
"learning_rate": 0.0001,
"loss": 1.9627,
"step": 933
},
{
"epoch": 0.6076772934287573,
"grad_norm": 0.1917680948972702,
"learning_rate": 0.0001,
"loss": 1.9659,
"step": 934
},
{
"epoch": 0.6083279115159401,
"grad_norm": 0.3457018733024597,
"learning_rate": 0.0001,
"loss": 2.4537,
"step": 935
},
{
"epoch": 0.6089785296031229,
"grad_norm": 0.2027028501033783,
"learning_rate": 0.0001,
"loss": 2.1681,
"step": 936
},
{
"epoch": 0.6096291476903057,
"grad_norm": 0.24525637924671173,
"learning_rate": 0.0001,
"loss": 2.0816,
"step": 937
},
{
"epoch": 0.6102797657774887,
"grad_norm": 0.2690584659576416,
"learning_rate": 0.0001,
"loss": 2.7011,
"step": 938
},
{
"epoch": 0.6109303838646715,
"grad_norm": 0.20961976051330566,
"learning_rate": 0.0001,
"loss": 2.576,
"step": 939
},
{
"epoch": 0.6115810019518543,
"grad_norm": 0.21827319264411926,
"learning_rate": 0.0001,
"loss": 2.2605,
"step": 940
},
{
"epoch": 0.6122316200390371,
"grad_norm": 0.20448362827301025,
"learning_rate": 0.0001,
"loss": 1.9963,
"step": 941
},
{
"epoch": 0.6128822381262199,
"grad_norm": 0.2513864040374756,
"learning_rate": 0.0001,
"loss": 2.4111,
"step": 942
},
{
"epoch": 0.6135328562134027,
"grad_norm": 0.28347763419151306,
"learning_rate": 0.0001,
"loss": 2.3459,
"step": 943
},
{
"epoch": 0.6141834743005855,
"grad_norm": 0.20679716765880585,
"learning_rate": 0.0001,
"loss": 1.9423,
"step": 944
},
{
"epoch": 0.6148340923877684,
"grad_norm": 0.20072445273399353,
"learning_rate": 0.0001,
"loss": 2.2,
"step": 945
},
{
"epoch": 0.6154847104749512,
"grad_norm": 0.2190425843000412,
"learning_rate": 0.0001,
"loss": 2.358,
"step": 946
},
{
"epoch": 0.616135328562134,
"grad_norm": 0.2672726511955261,
"learning_rate": 0.0001,
"loss": 2.5034,
"step": 947
},
{
"epoch": 0.6167859466493169,
"grad_norm": 0.20329232513904572,
"learning_rate": 0.0001,
"loss": 2.2972,
"step": 948
},
{
"epoch": 0.6174365647364997,
"grad_norm": 0.21593444049358368,
"learning_rate": 0.0001,
"loss": 2.8221,
"step": 949
},
{
"epoch": 0.6180871828236825,
"grad_norm": 0.22062361240386963,
"learning_rate": 0.0001,
"loss": 2.2051,
"step": 950
},
{
"epoch": 0.6187378009108653,
"grad_norm": 0.20640413463115692,
"learning_rate": 0.0001,
"loss": 2.1973,
"step": 951
},
{
"epoch": 0.6193884189980482,
"grad_norm": 0.18919388949871063,
"learning_rate": 0.0001,
"loss": 2.1166,
"step": 952
},
{
"epoch": 0.620039037085231,
"grad_norm": 0.18566597998142242,
"learning_rate": 0.0001,
"loss": 1.9342,
"step": 953
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.3724953234195709,
"learning_rate": 0.0001,
"loss": 3.0303,
"step": 954
},
{
"epoch": 0.6213402732595966,
"grad_norm": 0.24559584259986877,
"learning_rate": 0.0001,
"loss": 2.387,
"step": 955
},
{
"epoch": 0.6219908913467794,
"grad_norm": 0.20384235680103302,
"learning_rate": 0.0001,
"loss": 2.1224,
"step": 956
},
{
"epoch": 0.6226415094339622,
"grad_norm": 0.3225831687450409,
"learning_rate": 0.0001,
"loss": 2.4856,
"step": 957
},
{
"epoch": 0.623292127521145,
"grad_norm": 0.21676267683506012,
"learning_rate": 0.0001,
"loss": 2.3457,
"step": 958
},
{
"epoch": 0.623942745608328,
"grad_norm": 0.21707187592983246,
"learning_rate": 0.0001,
"loss": 2.3985,
"step": 959
},
{
"epoch": 0.6245933636955108,
"grad_norm": 0.311277836561203,
"learning_rate": 0.0001,
"loss": 2.3087,
"step": 960
},
{
"epoch": 0.6252439817826936,
"grad_norm": 0.18904085457324982,
"learning_rate": 0.0001,
"loss": 1.9421,
"step": 961
},
{
"epoch": 0.6258945998698764,
"grad_norm": 0.39046210050582886,
"learning_rate": 0.0001,
"loss": 2.7524,
"step": 962
},
{
"epoch": 0.6265452179570592,
"grad_norm": 0.18455897271633148,
"learning_rate": 0.0001,
"loss": 1.7536,
"step": 963
},
{
"epoch": 0.627195836044242,
"grad_norm": 0.1874053180217743,
"learning_rate": 0.0001,
"loss": 2.0853,
"step": 964
},
{
"epoch": 0.6278464541314248,
"grad_norm": 0.24766068160533905,
"learning_rate": 0.0001,
"loss": 2.8099,
"step": 965
},
{
"epoch": 0.6284970722186076,
"grad_norm": 0.20977729558944702,
"learning_rate": 0.0001,
"loss": 2.0339,
"step": 966
},
{
"epoch": 0.6291476903057905,
"grad_norm": 0.2659202516078949,
"learning_rate": 0.0001,
"loss": 2.1282,
"step": 967
},
{
"epoch": 0.6297983083929733,
"grad_norm": 0.23760046064853668,
"learning_rate": 0.0001,
"loss": 2.4225,
"step": 968
},
{
"epoch": 0.6304489264801562,
"grad_norm": 0.1884511113166809,
"learning_rate": 0.0001,
"loss": 1.972,
"step": 969
},
{
"epoch": 0.631099544567339,
"grad_norm": 0.2816404402256012,
"learning_rate": 0.0001,
"loss": 2.6831,
"step": 970
},
{
"epoch": 0.6317501626545218,
"grad_norm": 0.1874386966228485,
"learning_rate": 0.0001,
"loss": 2.0042,
"step": 971
},
{
"epoch": 0.6324007807417046,
"grad_norm": 0.21592558920383453,
"learning_rate": 0.0001,
"loss": 2.338,
"step": 972
},
{
"epoch": 0.6330513988288874,
"grad_norm": 0.22190915048122406,
"learning_rate": 0.0001,
"loss": 2.23,
"step": 973
},
{
"epoch": 0.6337020169160703,
"grad_norm": 0.23270365595817566,
"learning_rate": 0.0001,
"loss": 2.1849,
"step": 974
},
{
"epoch": 0.6343526350032531,
"grad_norm": 0.20524165034294128,
"learning_rate": 0.0001,
"loss": 1.8509,
"step": 975
},
{
"epoch": 0.6350032530904359,
"grad_norm": 0.27826493978500366,
"learning_rate": 0.0001,
"loss": 2.6736,
"step": 976
},
{
"epoch": 0.6356538711776187,
"grad_norm": 0.19887575507164001,
"learning_rate": 0.0001,
"loss": 2.1369,
"step": 977
},
{
"epoch": 0.6363044892648015,
"grad_norm": 0.3760605752468109,
"learning_rate": 0.0001,
"loss": 2.7617,
"step": 978
},
{
"epoch": 0.6369551073519844,
"grad_norm": 0.2116486132144928,
"learning_rate": 0.0001,
"loss": 2.1353,
"step": 979
},
{
"epoch": 0.6376057254391672,
"grad_norm": 0.20685400068759918,
"learning_rate": 0.0001,
"loss": 2.2221,
"step": 980
},
{
"epoch": 0.6382563435263501,
"grad_norm": 0.25631460547447205,
"learning_rate": 0.0001,
"loss": 2.2755,
"step": 981
},
{
"epoch": 0.6389069616135329,
"grad_norm": 0.2831932604312897,
"learning_rate": 0.0001,
"loss": 2.2544,
"step": 982
},
{
"epoch": 0.6395575797007157,
"grad_norm": 0.19301310181617737,
"learning_rate": 0.0001,
"loss": 2.1736,
"step": 983
},
{
"epoch": 0.6402081977878985,
"grad_norm": 0.18511143326759338,
"learning_rate": 0.0001,
"loss": 1.8847,
"step": 984
},
{
"epoch": 0.6408588158750813,
"grad_norm": 0.23753167688846588,
"learning_rate": 0.0001,
"loss": 2.131,
"step": 985
},
{
"epoch": 0.6415094339622641,
"grad_norm": 0.24566152691841125,
"learning_rate": 0.0001,
"loss": 2.2071,
"step": 986
},
{
"epoch": 0.6421600520494469,
"grad_norm": 0.21481812000274658,
"learning_rate": 0.0001,
"loss": 2.0292,
"step": 987
},
{
"epoch": 0.6428106701366298,
"grad_norm": 0.3042278587818146,
"learning_rate": 0.0001,
"loss": 2.6444,
"step": 988
},
{
"epoch": 0.6434612882238127,
"grad_norm": 0.30741778016090393,
"learning_rate": 0.0001,
"loss": 2.5146,
"step": 989
},
{
"epoch": 0.6441119063109955,
"grad_norm": 0.40835896134376526,
"learning_rate": 0.0001,
"loss": 2.9053,
"step": 990
},
{
"epoch": 0.6447625243981783,
"grad_norm": 0.21121574938297272,
"learning_rate": 0.0001,
"loss": 2.4513,
"step": 991
},
{
"epoch": 0.6454131424853611,
"grad_norm": 0.2634606659412384,
"learning_rate": 0.0001,
"loss": 2.3141,
"step": 992
},
{
"epoch": 0.6460637605725439,
"grad_norm": 0.2463708072900772,
"learning_rate": 0.0001,
"loss": 2.4421,
"step": 993
},
{
"epoch": 0.6467143786597267,
"grad_norm": 0.25485244393348694,
"learning_rate": 0.0001,
"loss": 2.3788,
"step": 994
},
{
"epoch": 0.6473649967469096,
"grad_norm": 0.20773370563983917,
"learning_rate": 0.0001,
"loss": 1.9861,
"step": 995
},
{
"epoch": 0.6480156148340924,
"grad_norm": 0.20728078484535217,
"learning_rate": 0.0001,
"loss": 2.3341,
"step": 996
},
{
"epoch": 0.6486662329212752,
"grad_norm": 0.26925981044769287,
"learning_rate": 0.0001,
"loss": 2.9172,
"step": 997
},
{
"epoch": 0.649316851008458,
"grad_norm": 0.21403877437114716,
"learning_rate": 0.0001,
"loss": 2.1318,
"step": 998
},
{
"epoch": 0.6499674690956408,
"grad_norm": 0.2597064673900604,
"learning_rate": 0.0001,
"loss": 2.4316,
"step": 999
},
{
"epoch": 0.6506180871828237,
"grad_norm": 0.26858747005462646,
"learning_rate": 0.0001,
"loss": 2.2716,
"step": 1000
},
{
"epoch": 0.6512687052700065,
"grad_norm": 0.5603036880493164,
"learning_rate": 0.0001,
"loss": 3.1137,
"step": 1001
},
{
"epoch": 0.6519193233571894,
"grad_norm": 0.2423018366098404,
"learning_rate": 0.0001,
"loss": 2.2346,
"step": 1002
},
{
"epoch": 0.6525699414443722,
"grad_norm": 0.22914621233940125,
"learning_rate": 0.0001,
"loss": 2.2852,
"step": 1003
},
{
"epoch": 0.653220559531555,
"grad_norm": 0.22781658172607422,
"learning_rate": 0.0001,
"loss": 2.1961,
"step": 1004
},
{
"epoch": 0.6538711776187378,
"grad_norm": 0.2614092528820038,
"learning_rate": 0.0001,
"loss": 2.0631,
"step": 1005
},
{
"epoch": 0.6545217957059206,
"grad_norm": 0.23658867180347443,
"learning_rate": 0.0001,
"loss": 2.0379,
"step": 1006
},
{
"epoch": 0.6551724137931034,
"grad_norm": 0.20862211287021637,
"learning_rate": 0.0001,
"loss": 2.2786,
"step": 1007
},
{
"epoch": 0.6558230318802862,
"grad_norm": 0.2251960188150406,
"learning_rate": 0.0001,
"loss": 2.06,
"step": 1008
},
{
"epoch": 0.656473649967469,
"grad_norm": 0.2885074317455292,
"learning_rate": 0.0001,
"loss": 2.2583,
"step": 1009
},
{
"epoch": 0.657124268054652,
"grad_norm": 0.20309656858444214,
"learning_rate": 0.0001,
"loss": 2.1557,
"step": 1010
},
{
"epoch": 0.6577748861418348,
"grad_norm": 0.20139531791210175,
"learning_rate": 0.0001,
"loss": 2.3419,
"step": 1011
},
{
"epoch": 0.6584255042290176,
"grad_norm": 0.2853332757949829,
"learning_rate": 0.0001,
"loss": 2.1415,
"step": 1012
},
{
"epoch": 0.6590761223162004,
"grad_norm": 0.2907620966434479,
"learning_rate": 0.0001,
"loss": 2.4452,
"step": 1013
},
{
"epoch": 0.6597267404033832,
"grad_norm": 0.18982461094856262,
"learning_rate": 0.0001,
"loss": 2.0215,
"step": 1014
},
{
"epoch": 0.660377358490566,
"grad_norm": 0.20890061557292938,
"learning_rate": 0.0001,
"loss": 2.0383,
"step": 1015
},
{
"epoch": 0.6610279765777488,
"grad_norm": 0.21294118463993073,
"learning_rate": 0.0001,
"loss": 1.7722,
"step": 1016
},
{
"epoch": 0.6616785946649317,
"grad_norm": 0.22494040429592133,
"learning_rate": 0.0001,
"loss": 2.034,
"step": 1017
},
{
"epoch": 0.6623292127521145,
"grad_norm": 0.25089555978775024,
"learning_rate": 0.0001,
"loss": 2.3322,
"step": 1018
},
{
"epoch": 0.6629798308392973,
"grad_norm": 0.18898023664951324,
"learning_rate": 0.0001,
"loss": 1.9914,
"step": 1019
},
{
"epoch": 0.6636304489264802,
"grad_norm": 0.221091166138649,
"learning_rate": 0.0001,
"loss": 2.1613,
"step": 1020
},
{
"epoch": 0.664281067013663,
"grad_norm": 0.22317297756671906,
"learning_rate": 0.0001,
"loss": 2.3438,
"step": 1021
},
{
"epoch": 0.6649316851008458,
"grad_norm": 0.18826670944690704,
"learning_rate": 0.0001,
"loss": 2.0218,
"step": 1022
},
{
"epoch": 0.6655823031880286,
"grad_norm": 0.22612391412258148,
"learning_rate": 0.0001,
"loss": 2.2931,
"step": 1023
},
{
"epoch": 0.6662329212752115,
"grad_norm": 0.3006114959716797,
"learning_rate": 0.0001,
"loss": 2.4949,
"step": 1024
},
{
"epoch": 0.6668835393623943,
"grad_norm": 0.1835569143295288,
"learning_rate": 0.0001,
"loss": 1.9396,
"step": 1025
},
{
"epoch": 0.6675341574495771,
"grad_norm": 0.19352416694164276,
"learning_rate": 0.0001,
"loss": 2.0038,
"step": 1026
},
{
"epoch": 0.6681847755367599,
"grad_norm": 0.2259102463722229,
"learning_rate": 0.0001,
"loss": 2.1818,
"step": 1027
},
{
"epoch": 0.6688353936239427,
"grad_norm": 0.20237034559249878,
"learning_rate": 0.0001,
"loss": 2.3196,
"step": 1028
},
{
"epoch": 0.6694860117111255,
"grad_norm": 0.1844060719013214,
"learning_rate": 0.0001,
"loss": 2.1389,
"step": 1029
},
{
"epoch": 0.6701366297983083,
"grad_norm": 0.21057841181755066,
"learning_rate": 0.0001,
"loss": 2.0058,
"step": 1030
},
{
"epoch": 0.6707872478854913,
"grad_norm": 0.20054426789283752,
"learning_rate": 0.0001,
"loss": 2.2874,
"step": 1031
},
{
"epoch": 0.6714378659726741,
"grad_norm": 0.2507307529449463,
"learning_rate": 0.0001,
"loss": 2.4245,
"step": 1032
},
{
"epoch": 0.6720884840598569,
"grad_norm": 0.21066251397132874,
"learning_rate": 0.0001,
"loss": 2.1688,
"step": 1033
},
{
"epoch": 0.6727391021470397,
"grad_norm": 0.22210632264614105,
"learning_rate": 0.0001,
"loss": 2.1985,
"step": 1034
},
{
"epoch": 0.6733897202342225,
"grad_norm": 0.21617744863033295,
"learning_rate": 0.0001,
"loss": 2.5918,
"step": 1035
},
{
"epoch": 0.6740403383214053,
"grad_norm": 0.46473971009254456,
"learning_rate": 0.0001,
"loss": 2.9341,
"step": 1036
},
{
"epoch": 0.6746909564085881,
"grad_norm": 0.20464558899402618,
"learning_rate": 0.0001,
"loss": 2.1654,
"step": 1037
},
{
"epoch": 0.675341574495771,
"grad_norm": 0.212956503033638,
"learning_rate": 0.0001,
"loss": 2.1959,
"step": 1038
},
{
"epoch": 0.6759921925829538,
"grad_norm": 0.2572340667247772,
"learning_rate": 0.0001,
"loss": 2.4918,
"step": 1039
},
{
"epoch": 0.6766428106701367,
"grad_norm": 0.3264685273170471,
"learning_rate": 0.0001,
"loss": 2.8708,
"step": 1040
},
{
"epoch": 0.6772934287573195,
"grad_norm": 0.22119931876659393,
"learning_rate": 0.0001,
"loss": 2.2222,
"step": 1041
},
{
"epoch": 0.6779440468445023,
"grad_norm": 0.24374569952487946,
"learning_rate": 0.0001,
"loss": 2.2457,
"step": 1042
},
{
"epoch": 0.6785946649316851,
"grad_norm": 0.2548108696937561,
"learning_rate": 0.0001,
"loss": 2.485,
"step": 1043
},
{
"epoch": 0.6792452830188679,
"grad_norm": 0.20976418256759644,
"learning_rate": 0.0001,
"loss": 2.3068,
"step": 1044
},
{
"epoch": 0.6798959011060507,
"grad_norm": 0.25135618448257446,
"learning_rate": 0.0001,
"loss": 2.1083,
"step": 1045
},
{
"epoch": 0.6805465191932336,
"grad_norm": 0.2677728831768036,
"learning_rate": 0.0001,
"loss": 2.4257,
"step": 1046
},
{
"epoch": 0.6811971372804164,
"grad_norm": 0.20250125229358673,
"learning_rate": 0.0001,
"loss": 2.0643,
"step": 1047
},
{
"epoch": 0.6818477553675992,
"grad_norm": 0.20850299298763275,
"learning_rate": 0.0001,
"loss": 2.0383,
"step": 1048
},
{
"epoch": 0.682498373454782,
"grad_norm": 0.21116970479488373,
"learning_rate": 0.0001,
"loss": 2.0259,
"step": 1049
},
{
"epoch": 0.6831489915419648,
"grad_norm": 0.2572707235813141,
"learning_rate": 0.0001,
"loss": 2.1982,
"step": 1050
},
{
"epoch": 0.6837996096291477,
"grad_norm": 0.2010831981897354,
"learning_rate": 0.0001,
"loss": 2.0687,
"step": 1051
},
{
"epoch": 0.6844502277163305,
"grad_norm": 0.23995356261730194,
"learning_rate": 0.0001,
"loss": 2.1938,
"step": 1052
},
{
"epoch": 0.6851008458035134,
"grad_norm": 0.21428103744983673,
"learning_rate": 0.0001,
"loss": 2.2514,
"step": 1053
},
{
"epoch": 0.6857514638906962,
"grad_norm": 0.21370433270931244,
"learning_rate": 0.0001,
"loss": 2.2523,
"step": 1054
},
{
"epoch": 0.686402081977879,
"grad_norm": 0.2131800800561905,
"learning_rate": 0.0001,
"loss": 2.2413,
"step": 1055
},
{
"epoch": 0.6870527000650618,
"grad_norm": 0.20007681846618652,
"learning_rate": 0.0001,
"loss": 2.176,
"step": 1056
},
{
"epoch": 0.6877033181522446,
"grad_norm": 0.2108153998851776,
"learning_rate": 0.0001,
"loss": 2.1081,
"step": 1057
},
{
"epoch": 0.6883539362394274,
"grad_norm": 0.19952858984470367,
"learning_rate": 0.0001,
"loss": 2.0249,
"step": 1058
},
{
"epoch": 0.6890045543266102,
"grad_norm": 0.20590882003307343,
"learning_rate": 0.0001,
"loss": 2.1949,
"step": 1059
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.2126530408859253,
"learning_rate": 0.0001,
"loss": 2.2726,
"step": 1060
},
{
"epoch": 0.690305790500976,
"grad_norm": 0.30162468552589417,
"learning_rate": 0.0001,
"loss": 2.5032,
"step": 1061
},
{
"epoch": 0.6909564085881588,
"grad_norm": 0.24452462792396545,
"learning_rate": 0.0001,
"loss": 2.3021,
"step": 1062
},
{
"epoch": 0.6916070266753416,
"grad_norm": 0.17819760739803314,
"learning_rate": 0.0001,
"loss": 1.9628,
"step": 1063
},
{
"epoch": 0.6922576447625244,
"grad_norm": 0.17437471449375153,
"learning_rate": 0.0001,
"loss": 1.879,
"step": 1064
},
{
"epoch": 0.6929082628497072,
"grad_norm": 0.3003963232040405,
"learning_rate": 0.0001,
"loss": 2.4695,
"step": 1065
},
{
"epoch": 0.69355888093689,
"grad_norm": 0.2007562667131424,
"learning_rate": 0.0001,
"loss": 1.9754,
"step": 1066
},
{
"epoch": 0.6942094990240729,
"grad_norm": 0.21425336599349976,
"learning_rate": 0.0001,
"loss": 2.1767,
"step": 1067
},
{
"epoch": 0.6948601171112557,
"grad_norm": 0.20287302136421204,
"learning_rate": 0.0001,
"loss": 1.9933,
"step": 1068
},
{
"epoch": 0.6955107351984385,
"grad_norm": 0.2762700021266937,
"learning_rate": 0.0001,
"loss": 2.1079,
"step": 1069
},
{
"epoch": 0.6961613532856213,
"grad_norm": 0.18358288705348969,
"learning_rate": 0.0001,
"loss": 1.9445,
"step": 1070
},
{
"epoch": 0.6968119713728042,
"grad_norm": 0.21157526969909668,
"learning_rate": 0.0001,
"loss": 2.169,
"step": 1071
},
{
"epoch": 0.697462589459987,
"grad_norm": 0.1847715675830841,
"learning_rate": 0.0001,
"loss": 2.0757,
"step": 1072
},
{
"epoch": 0.6981132075471698,
"grad_norm": 0.1923181712627411,
"learning_rate": 0.0001,
"loss": 2.2365,
"step": 1073
},
{
"epoch": 0.6987638256343527,
"grad_norm": 0.26491835713386536,
"learning_rate": 0.0001,
"loss": 2.4613,
"step": 1074
},
{
"epoch": 0.6994144437215355,
"grad_norm": 0.17674419283866882,
"learning_rate": 0.0001,
"loss": 1.9706,
"step": 1075
},
{
"epoch": 0.7000650618087183,
"grad_norm": 0.19894379377365112,
"learning_rate": 0.0001,
"loss": 1.9227,
"step": 1076
},
{
"epoch": 0.7007156798959011,
"grad_norm": 0.19496971368789673,
"learning_rate": 0.0001,
"loss": 2.1783,
"step": 1077
},
{
"epoch": 0.7013662979830839,
"grad_norm": 0.20685461163520813,
"learning_rate": 0.0001,
"loss": 2.1542,
"step": 1078
},
{
"epoch": 0.7020169160702667,
"grad_norm": 0.23061524331569672,
"learning_rate": 0.0001,
"loss": 2.3346,
"step": 1079
},
{
"epoch": 0.7026675341574495,
"grad_norm": 0.2044321447610855,
"learning_rate": 0.0001,
"loss": 2.0157,
"step": 1080
},
{
"epoch": 0.7033181522446325,
"grad_norm": 0.18851466476917267,
"learning_rate": 0.0001,
"loss": 2.2045,
"step": 1081
},
{
"epoch": 0.7039687703318153,
"grad_norm": 0.18530018627643585,
"learning_rate": 0.0001,
"loss": 2.0695,
"step": 1082
},
{
"epoch": 0.7046193884189981,
"grad_norm": 0.23562023043632507,
"learning_rate": 0.0001,
"loss": 2.3919,
"step": 1083
},
{
"epoch": 0.7052700065061809,
"grad_norm": 0.22246116399765015,
"learning_rate": 0.0001,
"loss": 2.5821,
"step": 1084
},
{
"epoch": 0.7059206245933637,
"grad_norm": 0.2134729027748108,
"learning_rate": 0.0001,
"loss": 2.2181,
"step": 1085
},
{
"epoch": 0.7065712426805465,
"grad_norm": 0.29674917459487915,
"learning_rate": 0.0001,
"loss": 2.5069,
"step": 1086
},
{
"epoch": 0.7072218607677293,
"grad_norm": 0.2098974883556366,
"learning_rate": 0.0001,
"loss": 2.3307,
"step": 1087
},
{
"epoch": 0.7078724788549121,
"grad_norm": 0.27041876316070557,
"learning_rate": 0.0001,
"loss": 2.8081,
"step": 1088
},
{
"epoch": 0.708523096942095,
"grad_norm": 0.19734299182891846,
"learning_rate": 0.0001,
"loss": 2.0588,
"step": 1089
},
{
"epoch": 0.7091737150292778,
"grad_norm": 0.22952257096767426,
"learning_rate": 0.0001,
"loss": 2.2607,
"step": 1090
},
{
"epoch": 0.7098243331164606,
"grad_norm": 0.20846691727638245,
"learning_rate": 0.0001,
"loss": 2.1657,
"step": 1091
},
{
"epoch": 0.7104749512036435,
"grad_norm": 0.19664259254932404,
"learning_rate": 0.0001,
"loss": 2.1256,
"step": 1092
},
{
"epoch": 0.7111255692908263,
"grad_norm": 0.23994791507720947,
"learning_rate": 0.0001,
"loss": 2.5377,
"step": 1093
},
{
"epoch": 0.7117761873780091,
"grad_norm": 0.22439789772033691,
"learning_rate": 0.0001,
"loss": 2.6225,
"step": 1094
},
{
"epoch": 0.7124268054651919,
"grad_norm": 0.20211316645145416,
"learning_rate": 0.0001,
"loss": 2.0582,
"step": 1095
},
{
"epoch": 0.7130774235523748,
"grad_norm": 0.23308198153972626,
"learning_rate": 0.0001,
"loss": 2.4341,
"step": 1096
},
{
"epoch": 0.7137280416395576,
"grad_norm": 0.17806245386600494,
"learning_rate": 0.0001,
"loss": 2.0211,
"step": 1097
},
{
"epoch": 0.7143786597267404,
"grad_norm": 0.20525243878364563,
"learning_rate": 0.0001,
"loss": 2.1248,
"step": 1098
},
{
"epoch": 0.7150292778139232,
"grad_norm": 0.22835716605186462,
"learning_rate": 0.0001,
"loss": 2.2993,
"step": 1099
},
{
"epoch": 0.715679895901106,
"grad_norm": 0.37078213691711426,
"learning_rate": 0.0001,
"loss": 3.1289,
"step": 1100
},
{
"epoch": 0.7163305139882888,
"grad_norm": 0.22253082692623138,
"learning_rate": 0.0001,
"loss": 2.2304,
"step": 1101
},
{
"epoch": 0.7169811320754716,
"grad_norm": 0.20494401454925537,
"learning_rate": 0.0001,
"loss": 1.9473,
"step": 1102
},
{
"epoch": 0.7176317501626546,
"grad_norm": 0.22128112614154816,
"learning_rate": 0.0001,
"loss": 1.993,
"step": 1103
},
{
"epoch": 0.7182823682498374,
"grad_norm": 0.20786182582378387,
"learning_rate": 0.0001,
"loss": 2.0048,
"step": 1104
},
{
"epoch": 0.7189329863370202,
"grad_norm": 0.27697819471359253,
"learning_rate": 0.0001,
"loss": 2.372,
"step": 1105
},
{
"epoch": 0.719583604424203,
"grad_norm": 0.26237788796424866,
"learning_rate": 0.0001,
"loss": 1.9573,
"step": 1106
},
{
"epoch": 0.7202342225113858,
"grad_norm": 0.2544906437397003,
"learning_rate": 0.0001,
"loss": 2.2805,
"step": 1107
},
{
"epoch": 0.7208848405985686,
"grad_norm": 0.2175043374300003,
"learning_rate": 0.0001,
"loss": 2.3201,
"step": 1108
},
{
"epoch": 0.7215354586857514,
"grad_norm": 0.19637277722358704,
"learning_rate": 0.0001,
"loss": 1.8868,
"step": 1109
},
{
"epoch": 0.7221860767729343,
"grad_norm": 0.19888024032115936,
"learning_rate": 0.0001,
"loss": 2.0324,
"step": 1110
},
{
"epoch": 0.7228366948601171,
"grad_norm": 0.20008981227874756,
"learning_rate": 0.0001,
"loss": 2.2898,
"step": 1111
},
{
"epoch": 0.7234873129473,
"grad_norm": 0.25185343623161316,
"learning_rate": 0.0001,
"loss": 2.2424,
"step": 1112
},
{
"epoch": 0.7241379310344828,
"grad_norm": 0.2434062957763672,
"learning_rate": 0.0001,
"loss": 2.2884,
"step": 1113
},
{
"epoch": 0.7247885491216656,
"grad_norm": 0.2278825044631958,
"learning_rate": 0.0001,
"loss": 2.1751,
"step": 1114
},
{
"epoch": 0.7254391672088484,
"grad_norm": 0.23180316388607025,
"learning_rate": 0.0001,
"loss": 2.6033,
"step": 1115
},
{
"epoch": 0.7260897852960312,
"grad_norm": 0.18574117124080658,
"learning_rate": 0.0001,
"loss": 2.3172,
"step": 1116
},
{
"epoch": 0.7267404033832141,
"grad_norm": 0.286155641078949,
"learning_rate": 0.0001,
"loss": 2.0482,
"step": 1117
},
{
"epoch": 0.7273910214703969,
"grad_norm": 0.1757357120513916,
"learning_rate": 0.0001,
"loss": 1.8881,
"step": 1118
},
{
"epoch": 0.7280416395575797,
"grad_norm": 0.25008201599121094,
"learning_rate": 0.0001,
"loss": 2.3797,
"step": 1119
},
{
"epoch": 0.7286922576447625,
"grad_norm": 0.29816892743110657,
"learning_rate": 0.0001,
"loss": 2.9163,
"step": 1120
},
{
"epoch": 0.7293428757319453,
"grad_norm": 0.1951293647289276,
"learning_rate": 0.0001,
"loss": 2.0613,
"step": 1121
},
{
"epoch": 0.7299934938191281,
"grad_norm": 0.23593062162399292,
"learning_rate": 0.0001,
"loss": 2.2103,
"step": 1122
},
{
"epoch": 0.730644111906311,
"grad_norm": 0.18619036674499512,
"learning_rate": 0.0001,
"loss": 1.9223,
"step": 1123
},
{
"epoch": 0.7312947299934938,
"grad_norm": 0.20853224396705627,
"learning_rate": 0.0001,
"loss": 2.2651,
"step": 1124
},
{
"epoch": 0.7319453480806767,
"grad_norm": 0.27427271008491516,
"learning_rate": 0.0001,
"loss": 2.3866,
"step": 1125
},
{
"epoch": 0.7325959661678595,
"grad_norm": 0.35531318187713623,
"learning_rate": 0.0001,
"loss": 2.8333,
"step": 1126
},
{
"epoch": 0.7332465842550423,
"grad_norm": 0.21375155448913574,
"learning_rate": 0.0001,
"loss": 2.0703,
"step": 1127
},
{
"epoch": 0.7338972023422251,
"grad_norm": 0.24240247905254364,
"learning_rate": 0.0001,
"loss": 2.3032,
"step": 1128
},
{
"epoch": 0.7345478204294079,
"grad_norm": 0.2277136594057083,
"learning_rate": 0.0001,
"loss": 2.585,
"step": 1129
},
{
"epoch": 0.7351984385165907,
"grad_norm": 0.20665140450000763,
"learning_rate": 0.0001,
"loss": 2.1351,
"step": 1130
},
{
"epoch": 0.7358490566037735,
"grad_norm": 0.2534540891647339,
"learning_rate": 0.0001,
"loss": 2.5023,
"step": 1131
},
{
"epoch": 0.7364996746909565,
"grad_norm": 0.19695554673671722,
"learning_rate": 0.0001,
"loss": 1.9286,
"step": 1132
},
{
"epoch": 0.7371502927781393,
"grad_norm": 0.18500645458698273,
"learning_rate": 0.0001,
"loss": 2.0609,
"step": 1133
},
{
"epoch": 0.7378009108653221,
"grad_norm": 0.2103162556886673,
"learning_rate": 0.0001,
"loss": 2.2247,
"step": 1134
},
{
"epoch": 0.7384515289525049,
"grad_norm": 0.20303300023078918,
"learning_rate": 0.0001,
"loss": 2.1164,
"step": 1135
},
{
"epoch": 0.7391021470396877,
"grad_norm": 0.23574739694595337,
"learning_rate": 0.0001,
"loss": 2.6325,
"step": 1136
},
{
"epoch": 0.7397527651268705,
"grad_norm": 0.2764929234981537,
"learning_rate": 0.0001,
"loss": 2.3049,
"step": 1137
},
{
"epoch": 0.7404033832140533,
"grad_norm": 0.23995018005371094,
"learning_rate": 0.0001,
"loss": 2.3196,
"step": 1138
},
{
"epoch": 0.7410540013012362,
"grad_norm": 0.19074063003063202,
"learning_rate": 0.0001,
"loss": 2.1566,
"step": 1139
},
{
"epoch": 0.741704619388419,
"grad_norm": 0.18186306953430176,
"learning_rate": 0.0001,
"loss": 1.9629,
"step": 1140
},
{
"epoch": 0.7423552374756018,
"grad_norm": 0.23841345310211182,
"learning_rate": 0.0001,
"loss": 2.1942,
"step": 1141
},
{
"epoch": 0.7430058555627846,
"grad_norm": 0.19697019457817078,
"learning_rate": 0.0001,
"loss": 2.0186,
"step": 1142
},
{
"epoch": 0.7436564736499675,
"grad_norm": 0.2117876410484314,
"learning_rate": 0.0001,
"loss": 2.4395,
"step": 1143
},
{
"epoch": 0.7443070917371503,
"grad_norm": 0.26921918988227844,
"learning_rate": 0.0001,
"loss": 2.4332,
"step": 1144
},
{
"epoch": 0.7449577098243331,
"grad_norm": 0.18999671936035156,
"learning_rate": 0.0001,
"loss": 2.0209,
"step": 1145
},
{
"epoch": 0.745608327911516,
"grad_norm": 0.22686484456062317,
"learning_rate": 0.0001,
"loss": 2.4369,
"step": 1146
},
{
"epoch": 0.7462589459986988,
"grad_norm": 0.22974656522274017,
"learning_rate": 0.0001,
"loss": 2.3737,
"step": 1147
},
{
"epoch": 0.7469095640858816,
"grad_norm": 0.19007977843284607,
"learning_rate": 0.0001,
"loss": 2.145,
"step": 1148
},
{
"epoch": 0.7475601821730644,
"grad_norm": 0.23000845313072205,
"learning_rate": 0.0001,
"loss": 2.0555,
"step": 1149
},
{
"epoch": 0.7482108002602472,
"grad_norm": 0.33339783549308777,
"learning_rate": 0.0001,
"loss": 2.7318,
"step": 1150
},
{
"epoch": 0.74886141834743,
"grad_norm": 0.18458595871925354,
"learning_rate": 0.0001,
"loss": 1.7868,
"step": 1151
},
{
"epoch": 0.7495120364346128,
"grad_norm": 0.2283509373664856,
"learning_rate": 0.0001,
"loss": 2.2609,
"step": 1152
},
{
"epoch": 0.7501626545217958,
"grad_norm": 0.31175729632377625,
"learning_rate": 0.0001,
"loss": 2.5524,
"step": 1153
},
{
"epoch": 0.7508132726089786,
"grad_norm": 0.18617112934589386,
"learning_rate": 0.0001,
"loss": 2.2029,
"step": 1154
},
{
"epoch": 0.7514638906961614,
"grad_norm": 0.28690317273139954,
"learning_rate": 0.0001,
"loss": 2.4705,
"step": 1155
},
{
"epoch": 0.7521145087833442,
"grad_norm": 0.2267671674489975,
"learning_rate": 0.0001,
"loss": 2.1093,
"step": 1156
},
{
"epoch": 0.752765126870527,
"grad_norm": 0.21956512331962585,
"learning_rate": 0.0001,
"loss": 2.0962,
"step": 1157
},
{
"epoch": 0.7534157449577098,
"grad_norm": 0.2681393027305603,
"learning_rate": 0.0001,
"loss": 2.35,
"step": 1158
},
{
"epoch": 0.7540663630448926,
"grad_norm": 0.23306699097156525,
"learning_rate": 0.0001,
"loss": 2.4911,
"step": 1159
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.3148876428604126,
"learning_rate": 0.0001,
"loss": 2.8802,
"step": 1160
},
{
"epoch": 0.7553675992192583,
"grad_norm": 0.2260347157716751,
"learning_rate": 0.0001,
"loss": 1.9286,
"step": 1161
},
{
"epoch": 0.7560182173064411,
"grad_norm": 0.24939195811748505,
"learning_rate": 0.0001,
"loss": 2.3544,
"step": 1162
},
{
"epoch": 0.756668835393624,
"grad_norm": 0.21007601916790009,
"learning_rate": 0.0001,
"loss": 2.0132,
"step": 1163
},
{
"epoch": 0.7573194534808068,
"grad_norm": 0.2570975720882416,
"learning_rate": 0.0001,
"loss": 1.9665,
"step": 1164
},
{
"epoch": 0.7579700715679896,
"grad_norm": 0.2818357050418854,
"learning_rate": 0.0001,
"loss": 2.2252,
"step": 1165
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.22388941049575806,
"learning_rate": 0.0001,
"loss": 2.4553,
"step": 1166
},
{
"epoch": 0.7592713077423552,
"grad_norm": 0.22799374163150787,
"learning_rate": 0.0001,
"loss": 2.4447,
"step": 1167
},
{
"epoch": 0.7599219258295381,
"grad_norm": 0.2610357105731964,
"learning_rate": 0.0001,
"loss": 2.4024,
"step": 1168
},
{
"epoch": 0.7605725439167209,
"grad_norm": 0.39793217182159424,
"learning_rate": 0.0001,
"loss": 3.1529,
"step": 1169
},
{
"epoch": 0.7612231620039037,
"grad_norm": 0.19805116951465607,
"learning_rate": 0.0001,
"loss": 1.9483,
"step": 1170
},
{
"epoch": 0.7618737800910865,
"grad_norm": 0.208368182182312,
"learning_rate": 0.0001,
"loss": 2.1785,
"step": 1171
},
{
"epoch": 0.7625243981782693,
"grad_norm": 0.25101637840270996,
"learning_rate": 0.0001,
"loss": 2.2517,
"step": 1172
},
{
"epoch": 0.7631750162654521,
"grad_norm": 0.27432793378829956,
"learning_rate": 0.0001,
"loss": 2.4759,
"step": 1173
},
{
"epoch": 0.763825634352635,
"grad_norm": 0.18746371567249298,
"learning_rate": 0.0001,
"loss": 2.0188,
"step": 1174
},
{
"epoch": 0.7644762524398179,
"grad_norm": 0.2882263958454132,
"learning_rate": 0.0001,
"loss": 2.2948,
"step": 1175
},
{
"epoch": 0.7651268705270007,
"grad_norm": 0.22075092792510986,
"learning_rate": 0.0001,
"loss": 2.4894,
"step": 1176
},
{
"epoch": 0.7657774886141835,
"grad_norm": 0.20792776346206665,
"learning_rate": 0.0001,
"loss": 1.8502,
"step": 1177
},
{
"epoch": 0.7664281067013663,
"grad_norm": 0.2436477392911911,
"learning_rate": 0.0001,
"loss": 2.1296,
"step": 1178
},
{
"epoch": 0.7670787247885491,
"grad_norm": 0.2839182913303375,
"learning_rate": 0.0001,
"loss": 2.8409,
"step": 1179
},
{
"epoch": 0.7677293428757319,
"grad_norm": 0.1826743334531784,
"learning_rate": 0.0001,
"loss": 1.941,
"step": 1180
},
{
"epoch": 0.7683799609629147,
"grad_norm": 0.2757255434989929,
"learning_rate": 0.0001,
"loss": 2.7297,
"step": 1181
},
{
"epoch": 0.7690305790500976,
"grad_norm": 0.23313826322555542,
"learning_rate": 0.0001,
"loss": 2.8796,
"step": 1182
},
{
"epoch": 0.7696811971372804,
"grad_norm": 0.28900882601737976,
"learning_rate": 0.0001,
"loss": 2.313,
"step": 1183
},
{
"epoch": 0.7703318152244633,
"grad_norm": 0.32883039116859436,
"learning_rate": 0.0001,
"loss": 3.041,
"step": 1184
},
{
"epoch": 0.7709824333116461,
"grad_norm": 0.2116912454366684,
"learning_rate": 0.0001,
"loss": 1.9891,
"step": 1185
},
{
"epoch": 0.7716330513988289,
"grad_norm": 0.2055017203092575,
"learning_rate": 0.0001,
"loss": 1.9567,
"step": 1186
},
{
"epoch": 0.7722836694860117,
"grad_norm": 0.2978801131248474,
"learning_rate": 0.0001,
"loss": 2.3322,
"step": 1187
},
{
"epoch": 0.7729342875731945,
"grad_norm": 0.21910034120082855,
"learning_rate": 0.0001,
"loss": 2.0262,
"step": 1188
},
{
"epoch": 0.7735849056603774,
"grad_norm": 0.19952894747257233,
"learning_rate": 0.0001,
"loss": 2.0621,
"step": 1189
},
{
"epoch": 0.7742355237475602,
"grad_norm": 0.20744554698467255,
"learning_rate": 0.0001,
"loss": 2.1154,
"step": 1190
},
{
"epoch": 0.774886141834743,
"grad_norm": 0.23886847496032715,
"learning_rate": 0.0001,
"loss": 2.3023,
"step": 1191
},
{
"epoch": 0.7755367599219258,
"grad_norm": 0.20722374320030212,
"learning_rate": 0.0001,
"loss": 2.2384,
"step": 1192
},
{
"epoch": 0.7761873780091086,
"grad_norm": 0.23317816853523254,
"learning_rate": 0.0001,
"loss": 2.6381,
"step": 1193
},
{
"epoch": 0.7768379960962914,
"grad_norm": 0.2527480125427246,
"learning_rate": 0.0001,
"loss": 2.1711,
"step": 1194
},
{
"epoch": 0.7774886141834743,
"grad_norm": 0.23817451298236847,
"learning_rate": 0.0001,
"loss": 2.6561,
"step": 1195
},
{
"epoch": 0.7781392322706572,
"grad_norm": 0.2609005570411682,
"learning_rate": 0.0001,
"loss": 2.5488,
"step": 1196
},
{
"epoch": 0.77878985035784,
"grad_norm": 0.19870908558368683,
"learning_rate": 0.0001,
"loss": 2.0435,
"step": 1197
},
{
"epoch": 0.7794404684450228,
"grad_norm": 0.20385386049747467,
"learning_rate": 0.0001,
"loss": 1.9711,
"step": 1198
},
{
"epoch": 0.7800910865322056,
"grad_norm": 0.20179738104343414,
"learning_rate": 0.0001,
"loss": 2.0247,
"step": 1199
},
{
"epoch": 0.7807417046193884,
"grad_norm": 0.40090981125831604,
"learning_rate": 0.0001,
"loss": 2.795,
"step": 1200
},
{
"epoch": 0.7813923227065712,
"grad_norm": 0.1885748654603958,
"learning_rate": 0.0001,
"loss": 2.1588,
"step": 1201
},
{
"epoch": 0.782042940793754,
"grad_norm": 0.21952667832374573,
"learning_rate": 0.0001,
"loss": 2.0901,
"step": 1202
},
{
"epoch": 0.7826935588809368,
"grad_norm": 0.2344968616962433,
"learning_rate": 0.0001,
"loss": 1.9943,
"step": 1203
},
{
"epoch": 0.7833441769681198,
"grad_norm": 0.3153589069843292,
"learning_rate": 0.0001,
"loss": 2.59,
"step": 1204
},
{
"epoch": 0.7839947950553026,
"grad_norm": 0.1870599389076233,
"learning_rate": 0.0001,
"loss": 1.9435,
"step": 1205
},
{
"epoch": 0.7846454131424854,
"grad_norm": 0.189214825630188,
"learning_rate": 0.0001,
"loss": 2.128,
"step": 1206
},
{
"epoch": 0.7852960312296682,
"grad_norm": 0.22551633417606354,
"learning_rate": 0.0001,
"loss": 2.3913,
"step": 1207
},
{
"epoch": 0.785946649316851,
"grad_norm": 0.19963033497333527,
"learning_rate": 0.0001,
"loss": 2.1456,
"step": 1208
},
{
"epoch": 0.7865972674040338,
"grad_norm": 0.2087828814983368,
"learning_rate": 0.0001,
"loss": 2.3486,
"step": 1209
},
{
"epoch": 0.7872478854912166,
"grad_norm": 0.19814416766166687,
"learning_rate": 0.0001,
"loss": 2.0208,
"step": 1210
},
{
"epoch": 0.7878985035783995,
"grad_norm": 0.20670342445373535,
"learning_rate": 0.0001,
"loss": 2.1276,
"step": 1211
},
{
"epoch": 0.7885491216655823,
"grad_norm": 0.1881658136844635,
"learning_rate": 0.0001,
"loss": 2.0502,
"step": 1212
},
{
"epoch": 0.7891997397527651,
"grad_norm": 0.2015887349843979,
"learning_rate": 0.0001,
"loss": 2.2935,
"step": 1213
},
{
"epoch": 0.789850357839948,
"grad_norm": 0.23532694578170776,
"learning_rate": 0.0001,
"loss": 2.8046,
"step": 1214
},
{
"epoch": 0.7905009759271308,
"grad_norm": 0.18583200871944427,
"learning_rate": 0.0001,
"loss": 1.7999,
"step": 1215
},
{
"epoch": 0.7911515940143136,
"grad_norm": 0.23056970536708832,
"learning_rate": 0.0001,
"loss": 2.126,
"step": 1216
},
{
"epoch": 0.7918022121014964,
"grad_norm": 0.3166569471359253,
"learning_rate": 0.0001,
"loss": 3.0332,
"step": 1217
},
{
"epoch": 0.7924528301886793,
"grad_norm": 0.273381769657135,
"learning_rate": 0.0001,
"loss": 2.2258,
"step": 1218
},
{
"epoch": 0.7931034482758621,
"grad_norm": 0.3166522979736328,
"learning_rate": 0.0001,
"loss": 2.35,
"step": 1219
},
{
"epoch": 0.7937540663630449,
"grad_norm": 0.1906355321407318,
"learning_rate": 0.0001,
"loss": 1.9739,
"step": 1220
},
{
"epoch": 0.7944046844502277,
"grad_norm": 0.2339126616716385,
"learning_rate": 0.0001,
"loss": 2.3575,
"step": 1221
},
{
"epoch": 0.7950553025374105,
"grad_norm": 0.2760171592235565,
"learning_rate": 0.0001,
"loss": 2.4708,
"step": 1222
},
{
"epoch": 0.7957059206245933,
"grad_norm": 0.17487159371376038,
"learning_rate": 0.0001,
"loss": 1.7924,
"step": 1223
},
{
"epoch": 0.7963565387117761,
"grad_norm": 0.19386877119541168,
"learning_rate": 0.0001,
"loss": 2.3044,
"step": 1224
},
{
"epoch": 0.7970071567989591,
"grad_norm": 0.18056143820285797,
"learning_rate": 0.0001,
"loss": 1.9543,
"step": 1225
},
{
"epoch": 0.7976577748861419,
"grad_norm": 0.3085278868675232,
"learning_rate": 0.0001,
"loss": 2.2131,
"step": 1226
},
{
"epoch": 0.7983083929733247,
"grad_norm": 0.1960904896259308,
"learning_rate": 0.0001,
"loss": 2.0918,
"step": 1227
},
{
"epoch": 0.7989590110605075,
"grad_norm": 0.19437837600708008,
"learning_rate": 0.0001,
"loss": 2.2241,
"step": 1228
},
{
"epoch": 0.7996096291476903,
"grad_norm": 0.2129238396883011,
"learning_rate": 0.0001,
"loss": 2.1891,
"step": 1229
},
{
"epoch": 0.8002602472348731,
"grad_norm": 0.20101650059223175,
"learning_rate": 0.0001,
"loss": 2.1341,
"step": 1230
},
{
"epoch": 0.8009108653220559,
"grad_norm": 0.20897014439105988,
"learning_rate": 0.0001,
"loss": 2.0937,
"step": 1231
},
{
"epoch": 0.8015614834092388,
"grad_norm": 0.2693694829940796,
"learning_rate": 0.0001,
"loss": 2.7406,
"step": 1232
},
{
"epoch": 0.8022121014964216,
"grad_norm": 0.2322738617658615,
"learning_rate": 0.0001,
"loss": 2.8483,
"step": 1233
},
{
"epoch": 0.8028627195836044,
"grad_norm": 0.21177823841571808,
"learning_rate": 0.0001,
"loss": 2.2315,
"step": 1234
},
{
"epoch": 0.8035133376707873,
"grad_norm": 0.2920454442501068,
"learning_rate": 0.0001,
"loss": 3.0264,
"step": 1235
},
{
"epoch": 0.8041639557579701,
"grad_norm": 0.2331319898366928,
"learning_rate": 0.0001,
"loss": 2.4574,
"step": 1236
},
{
"epoch": 0.8048145738451529,
"grad_norm": 0.2339990735054016,
"learning_rate": 0.0001,
"loss": 2.2752,
"step": 1237
},
{
"epoch": 0.8054651919323357,
"grad_norm": 0.22823981940746307,
"learning_rate": 0.0001,
"loss": 1.9615,
"step": 1238
},
{
"epoch": 0.8061158100195186,
"grad_norm": 0.20435038208961487,
"learning_rate": 0.0001,
"loss": 1.9989,
"step": 1239
},
{
"epoch": 0.8067664281067014,
"grad_norm": 0.32488611340522766,
"learning_rate": 0.0001,
"loss": 2.4791,
"step": 1240
},
{
"epoch": 0.8074170461938842,
"grad_norm": 0.27227675914764404,
"learning_rate": 0.0001,
"loss": 2.6443,
"step": 1241
},
{
"epoch": 0.808067664281067,
"grad_norm": 0.20864960551261902,
"learning_rate": 0.0001,
"loss": 2.2324,
"step": 1242
},
{
"epoch": 0.8087182823682498,
"grad_norm": 0.22645455598831177,
"learning_rate": 0.0001,
"loss": 2.0199,
"step": 1243
},
{
"epoch": 0.8093689004554326,
"grad_norm": 0.22091244161128998,
"learning_rate": 0.0001,
"loss": 2.1145,
"step": 1244
},
{
"epoch": 0.8100195185426154,
"grad_norm": 0.20442111790180206,
"learning_rate": 0.0001,
"loss": 2.1277,
"step": 1245
},
{
"epoch": 0.8106701366297983,
"grad_norm": 0.19400720298290253,
"learning_rate": 0.0001,
"loss": 1.951,
"step": 1246
},
{
"epoch": 0.8113207547169812,
"grad_norm": 0.474490225315094,
"learning_rate": 0.0001,
"loss": 3.0206,
"step": 1247
},
{
"epoch": 0.811971372804164,
"grad_norm": 0.23634073138237,
"learning_rate": 0.0001,
"loss": 2.2556,
"step": 1248
},
{
"epoch": 0.8126219908913468,
"grad_norm": 0.23998601734638214,
"learning_rate": 0.0001,
"loss": 2.3201,
"step": 1249
},
{
"epoch": 0.8132726089785296,
"grad_norm": 0.19258932769298553,
"learning_rate": 0.0001,
"loss": 1.9719,
"step": 1250
},
{
"epoch": 0.8139232270657124,
"grad_norm": 0.21039240062236786,
"learning_rate": 0.0001,
"loss": 2.3617,
"step": 1251
},
{
"epoch": 0.8145738451528952,
"grad_norm": 0.37176814675331116,
"learning_rate": 0.0001,
"loss": 2.7183,
"step": 1252
},
{
"epoch": 0.815224463240078,
"grad_norm": 0.24739331007003784,
"learning_rate": 0.0001,
"loss": 2.0098,
"step": 1253
},
{
"epoch": 0.8158750813272609,
"grad_norm": 0.32313254475593567,
"learning_rate": 0.0001,
"loss": 2.062,
"step": 1254
},
{
"epoch": 0.8165256994144438,
"grad_norm": 0.2571156322956085,
"learning_rate": 0.0001,
"loss": 2.3973,
"step": 1255
},
{
"epoch": 0.8171763175016266,
"grad_norm": 0.266369491815567,
"learning_rate": 0.0001,
"loss": 2.6019,
"step": 1256
},
{
"epoch": 0.8178269355888094,
"grad_norm": 0.3770993649959564,
"learning_rate": 0.0001,
"loss": 2.5413,
"step": 1257
},
{
"epoch": 0.8184775536759922,
"grad_norm": 0.24964609742164612,
"learning_rate": 0.0001,
"loss": 1.7407,
"step": 1258
},
{
"epoch": 0.819128171763175,
"grad_norm": 0.208835169672966,
"learning_rate": 0.0001,
"loss": 2.36,
"step": 1259
},
{
"epoch": 0.8197787898503578,
"grad_norm": 0.19789732992649078,
"learning_rate": 0.0001,
"loss": 2.0967,
"step": 1260
},
{
"epoch": 0.8204294079375407,
"grad_norm": 0.4847930669784546,
"learning_rate": 0.0001,
"loss": 2.9673,
"step": 1261
},
{
"epoch": 0.8210800260247235,
"grad_norm": 0.277960866689682,
"learning_rate": 0.0001,
"loss": 2.2165,
"step": 1262
},
{
"epoch": 0.8217306441119063,
"grad_norm": 0.20278669893741608,
"learning_rate": 0.0001,
"loss": 2.5098,
"step": 1263
},
{
"epoch": 0.8223812621990891,
"grad_norm": 0.3295345604419708,
"learning_rate": 0.0001,
"loss": 2.4451,
"step": 1264
},
{
"epoch": 0.8230318802862719,
"grad_norm": 0.25482621788978577,
"learning_rate": 0.0001,
"loss": 2.3178,
"step": 1265
},
{
"epoch": 0.8236824983734548,
"grad_norm": 0.21955101191997528,
"learning_rate": 0.0001,
"loss": 2.3245,
"step": 1266
},
{
"epoch": 0.8243331164606376,
"grad_norm": 0.19811898469924927,
"learning_rate": 0.0001,
"loss": 2.1608,
"step": 1267
},
{
"epoch": 0.8249837345478205,
"grad_norm": 0.20357833802700043,
"learning_rate": 0.0001,
"loss": 2.0502,
"step": 1268
},
{
"epoch": 0.8256343526350033,
"grad_norm": 0.25111669301986694,
"learning_rate": 0.0001,
"loss": 2.9059,
"step": 1269
},
{
"epoch": 0.8262849707221861,
"grad_norm": 0.20970256626605988,
"learning_rate": 0.0001,
"loss": 2.3496,
"step": 1270
},
{
"epoch": 0.8269355888093689,
"grad_norm": 0.19146494567394257,
"learning_rate": 0.0001,
"loss": 2.0773,
"step": 1271
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.2083313763141632,
"learning_rate": 0.0001,
"loss": 2.0031,
"step": 1272
},
{
"epoch": 0.8282368249837345,
"grad_norm": 0.19460196793079376,
"learning_rate": 0.0001,
"loss": 2.0411,
"step": 1273
},
{
"epoch": 0.8288874430709173,
"grad_norm": 0.1900896281003952,
"learning_rate": 0.0001,
"loss": 1.9517,
"step": 1274
},
{
"epoch": 0.8295380611581002,
"grad_norm": 0.20020513236522675,
"learning_rate": 0.0001,
"loss": 2.2062,
"step": 1275
},
{
"epoch": 0.8301886792452831,
"grad_norm": 0.21990856528282166,
"learning_rate": 0.0001,
"loss": 2.0837,
"step": 1276
},
{
"epoch": 0.8308392973324659,
"grad_norm": 0.1966349482536316,
"learning_rate": 0.0001,
"loss": 2.0407,
"step": 1277
},
{
"epoch": 0.8314899154196487,
"grad_norm": 0.19897864758968353,
"learning_rate": 0.0001,
"loss": 2.1639,
"step": 1278
},
{
"epoch": 0.8321405335068315,
"grad_norm": 0.21094024181365967,
"learning_rate": 0.0001,
"loss": 2.2158,
"step": 1279
},
{
"epoch": 0.8327911515940143,
"grad_norm": 0.1989631950855255,
"learning_rate": 0.0001,
"loss": 1.9578,
"step": 1280
},
{
"epoch": 0.8334417696811971,
"grad_norm": 0.1953240931034088,
"learning_rate": 0.0001,
"loss": 2.0365,
"step": 1281
},
{
"epoch": 0.8340923877683799,
"grad_norm": 0.33914485573768616,
"learning_rate": 0.0001,
"loss": 2.3676,
"step": 1282
},
{
"epoch": 0.8347430058555628,
"grad_norm": 0.17135807871818542,
"learning_rate": 0.0001,
"loss": 1.821,
"step": 1283
},
{
"epoch": 0.8353936239427456,
"grad_norm": 0.1993912309408188,
"learning_rate": 0.0001,
"loss": 2.4103,
"step": 1284
},
{
"epoch": 0.8360442420299284,
"grad_norm": 0.21222157776355743,
"learning_rate": 0.0001,
"loss": 2.3443,
"step": 1285
},
{
"epoch": 0.8366948601171112,
"grad_norm": 0.22162573039531708,
"learning_rate": 0.0001,
"loss": 2.1757,
"step": 1286
},
{
"epoch": 0.8373454782042941,
"grad_norm": 0.22677986323833466,
"learning_rate": 0.0001,
"loss": 2.0542,
"step": 1287
},
{
"epoch": 0.8379960962914769,
"grad_norm": 0.1974060982465744,
"learning_rate": 0.0001,
"loss": 2.1686,
"step": 1288
},
{
"epoch": 0.8386467143786597,
"grad_norm": 0.30552592873573303,
"learning_rate": 0.0001,
"loss": 2.5467,
"step": 1289
},
{
"epoch": 0.8392973324658426,
"grad_norm": 0.24357165396213531,
"learning_rate": 0.0001,
"loss": 2.3276,
"step": 1290
},
{
"epoch": 0.8399479505530254,
"grad_norm": 0.1960456818342209,
"learning_rate": 0.0001,
"loss": 2.0956,
"step": 1291
},
{
"epoch": 0.8405985686402082,
"grad_norm": 0.24264569580554962,
"learning_rate": 0.0001,
"loss": 2.0666,
"step": 1292
},
{
"epoch": 0.841249186727391,
"grad_norm": 0.25320202112197876,
"learning_rate": 0.0001,
"loss": 2.033,
"step": 1293
},
{
"epoch": 0.8418998048145738,
"grad_norm": 0.2313191145658493,
"learning_rate": 0.0001,
"loss": 2.0571,
"step": 1294
},
{
"epoch": 0.8425504229017566,
"grad_norm": 0.42846229672431946,
"learning_rate": 0.0001,
"loss": 2.0875,
"step": 1295
},
{
"epoch": 0.8432010409889394,
"grad_norm": 0.19277000427246094,
"learning_rate": 0.0001,
"loss": 1.9303,
"step": 1296
},
{
"epoch": 0.8438516590761224,
"grad_norm": 0.1947111338376999,
"learning_rate": 0.0001,
"loss": 1.9482,
"step": 1297
},
{
"epoch": 0.8445022771633052,
"grad_norm": 0.30196627974510193,
"learning_rate": 0.0001,
"loss": 2.3238,
"step": 1298
},
{
"epoch": 0.845152895250488,
"grad_norm": 0.21137486398220062,
"learning_rate": 0.0001,
"loss": 2.1962,
"step": 1299
},
{
"epoch": 0.8458035133376708,
"grad_norm": 0.2568284571170807,
"learning_rate": 0.0001,
"loss": 2.3231,
"step": 1300
},
{
"epoch": 0.8464541314248536,
"grad_norm": 0.2092464715242386,
"learning_rate": 0.0001,
"loss": 1.8074,
"step": 1301
},
{
"epoch": 0.8471047495120364,
"grad_norm": 0.2112191617488861,
"learning_rate": 0.0001,
"loss": 2.169,
"step": 1302
},
{
"epoch": 0.8477553675992192,
"grad_norm": 0.17425194382667542,
"learning_rate": 0.0001,
"loss": 1.8025,
"step": 1303
},
{
"epoch": 0.8484059856864021,
"grad_norm": 0.20808906853199005,
"learning_rate": 0.0001,
"loss": 2.0869,
"step": 1304
},
{
"epoch": 0.8490566037735849,
"grad_norm": 0.25200703740119934,
"learning_rate": 0.0001,
"loss": 2.4963,
"step": 1305
},
{
"epoch": 0.8497072218607677,
"grad_norm": 0.23948469758033752,
"learning_rate": 0.0001,
"loss": 2.3028,
"step": 1306
},
{
"epoch": 0.8503578399479506,
"grad_norm": 0.185250923037529,
"learning_rate": 0.0001,
"loss": 1.7409,
"step": 1307
},
{
"epoch": 0.8510084580351334,
"grad_norm": 0.18948182463645935,
"learning_rate": 0.0001,
"loss": 1.8922,
"step": 1308
},
{
"epoch": 0.8516590761223162,
"grad_norm": 0.2027200311422348,
"learning_rate": 0.0001,
"loss": 2.0922,
"step": 1309
},
{
"epoch": 0.852309694209499,
"grad_norm": 0.28325602412223816,
"learning_rate": 0.0001,
"loss": 2.0428,
"step": 1310
},
{
"epoch": 0.8529603122966819,
"grad_norm": 0.1829916387796402,
"learning_rate": 0.0001,
"loss": 1.9518,
"step": 1311
},
{
"epoch": 0.8536109303838647,
"grad_norm": 0.1982378512620926,
"learning_rate": 0.0001,
"loss": 2.0209,
"step": 1312
},
{
"epoch": 0.8542615484710475,
"grad_norm": 0.18915079534053802,
"learning_rate": 0.0001,
"loss": 1.9291,
"step": 1313
},
{
"epoch": 0.8549121665582303,
"grad_norm": 0.1832190752029419,
"learning_rate": 0.0001,
"loss": 1.9818,
"step": 1314
},
{
"epoch": 0.8555627846454131,
"grad_norm": 0.2646237313747406,
"learning_rate": 0.0001,
"loss": 2.4418,
"step": 1315
},
{
"epoch": 0.8562134027325959,
"grad_norm": 0.2831929326057434,
"learning_rate": 0.0001,
"loss": 2.8355,
"step": 1316
},
{
"epoch": 0.8568640208197787,
"grad_norm": 0.2711881995201111,
"learning_rate": 0.0001,
"loss": 2.1963,
"step": 1317
},
{
"epoch": 0.8575146389069617,
"grad_norm": 0.25786513090133667,
"learning_rate": 0.0001,
"loss": 3.002,
"step": 1318
},
{
"epoch": 0.8581652569941445,
"grad_norm": 0.26838061213493347,
"learning_rate": 0.0001,
"loss": 3.1155,
"step": 1319
},
{
"epoch": 0.8588158750813273,
"grad_norm": 0.2220889776945114,
"learning_rate": 0.0001,
"loss": 2.0535,
"step": 1320
},
{
"epoch": 0.8594664931685101,
"grad_norm": 0.2008647471666336,
"learning_rate": 0.0001,
"loss": 2.0515,
"step": 1321
},
{
"epoch": 0.8601171112556929,
"grad_norm": 0.22017711400985718,
"learning_rate": 0.0001,
"loss": 2.289,
"step": 1322
},
{
"epoch": 0.8607677293428757,
"grad_norm": 0.19674621522426605,
"learning_rate": 0.0001,
"loss": 1.9414,
"step": 1323
},
{
"epoch": 0.8614183474300585,
"grad_norm": 0.191552072763443,
"learning_rate": 0.0001,
"loss": 1.9939,
"step": 1324
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.20212143659591675,
"learning_rate": 0.0001,
"loss": 1.8938,
"step": 1325
},
{
"epoch": 0.8627195836044242,
"grad_norm": 0.22502020001411438,
"learning_rate": 0.0001,
"loss": 2.13,
"step": 1326
},
{
"epoch": 0.863370201691607,
"grad_norm": 0.2504305839538574,
"learning_rate": 0.0001,
"loss": 2.7666,
"step": 1327
},
{
"epoch": 0.8640208197787899,
"grad_norm": 0.19481819868087769,
"learning_rate": 0.0001,
"loss": 2.1141,
"step": 1328
},
{
"epoch": 0.8646714378659727,
"grad_norm": 0.21994583308696747,
"learning_rate": 0.0001,
"loss": 2.7615,
"step": 1329
},
{
"epoch": 0.8653220559531555,
"grad_norm": 0.19281654059886932,
"learning_rate": 0.0001,
"loss": 2.0864,
"step": 1330
},
{
"epoch": 0.8659726740403383,
"grad_norm": 0.20329228043556213,
"learning_rate": 0.0001,
"loss": 2.1002,
"step": 1331
},
{
"epoch": 0.8666232921275211,
"grad_norm": 0.19484490156173706,
"learning_rate": 0.0001,
"loss": 2.0519,
"step": 1332
},
{
"epoch": 0.867273910214704,
"grad_norm": 0.1867295801639557,
"learning_rate": 0.0001,
"loss": 1.9208,
"step": 1333
},
{
"epoch": 0.8679245283018868,
"grad_norm": 0.30128392577171326,
"learning_rate": 0.0001,
"loss": 2.7527,
"step": 1334
},
{
"epoch": 0.8685751463890696,
"grad_norm": 0.22880543768405914,
"learning_rate": 0.0001,
"loss": 2.449,
"step": 1335
},
{
"epoch": 0.8692257644762524,
"grad_norm": 0.23333753645420074,
"learning_rate": 0.0001,
"loss": 2.0425,
"step": 1336
},
{
"epoch": 0.8698763825634352,
"grad_norm": 0.34176793694496155,
"learning_rate": 0.0001,
"loss": 2.8857,
"step": 1337
},
{
"epoch": 0.870527000650618,
"grad_norm": 0.19983690977096558,
"learning_rate": 0.0001,
"loss": 2.2466,
"step": 1338
},
{
"epoch": 0.8711776187378009,
"grad_norm": 0.21883231401443481,
"learning_rate": 0.0001,
"loss": 2.1262,
"step": 1339
},
{
"epoch": 0.8718282368249838,
"grad_norm": 0.19143971800804138,
"learning_rate": 0.0001,
"loss": 2.0119,
"step": 1340
},
{
"epoch": 0.8724788549121666,
"grad_norm": 0.25845617055892944,
"learning_rate": 0.0001,
"loss": 2.6315,
"step": 1341
},
{
"epoch": 0.8731294729993494,
"grad_norm": 0.1914021521806717,
"learning_rate": 0.0001,
"loss": 1.8571,
"step": 1342
},
{
"epoch": 0.8737800910865322,
"grad_norm": 0.2742185592651367,
"learning_rate": 0.0001,
"loss": 2.1467,
"step": 1343
},
{
"epoch": 0.874430709173715,
"grad_norm": 0.19927754998207092,
"learning_rate": 0.0001,
"loss": 1.9877,
"step": 1344
},
{
"epoch": 0.8750813272608978,
"grad_norm": 0.2340778261423111,
"learning_rate": 0.0001,
"loss": 2.2476,
"step": 1345
},
{
"epoch": 0.8757319453480806,
"grad_norm": 0.2931828498840332,
"learning_rate": 0.0001,
"loss": 2.4643,
"step": 1346
},
{
"epoch": 0.8763825634352636,
"grad_norm": 0.18637506663799286,
"learning_rate": 0.0001,
"loss": 1.7933,
"step": 1347
},
{
"epoch": 0.8770331815224464,
"grad_norm": 0.1898747682571411,
"learning_rate": 0.0001,
"loss": 1.9781,
"step": 1348
},
{
"epoch": 0.8776837996096292,
"grad_norm": 0.229608952999115,
"learning_rate": 0.0001,
"loss": 2.1293,
"step": 1349
},
{
"epoch": 0.878334417696812,
"grad_norm": 0.31374409794807434,
"learning_rate": 0.0001,
"loss": 2.5436,
"step": 1350
},
{
"epoch": 0.8789850357839948,
"grad_norm": 0.22544679045677185,
"learning_rate": 0.0001,
"loss": 2.0882,
"step": 1351
},
{
"epoch": 0.8796356538711776,
"grad_norm": 0.2415180653333664,
"learning_rate": 0.0001,
"loss": 2.3193,
"step": 1352
},
{
"epoch": 0.8802862719583604,
"grad_norm": 0.28355568647384644,
"learning_rate": 0.0001,
"loss": 2.5994,
"step": 1353
},
{
"epoch": 0.8809368900455433,
"grad_norm": 0.19143925607204437,
"learning_rate": 0.0001,
"loss": 2.0546,
"step": 1354
},
{
"epoch": 0.8815875081327261,
"grad_norm": 0.2990890443325043,
"learning_rate": 0.0001,
"loss": 2.7388,
"step": 1355
},
{
"epoch": 0.8822381262199089,
"grad_norm": 0.28672561049461365,
"learning_rate": 0.0001,
"loss": 1.915,
"step": 1356
},
{
"epoch": 0.8828887443070917,
"grad_norm": 0.20137082040309906,
"learning_rate": 0.0001,
"loss": 2.5376,
"step": 1357
},
{
"epoch": 0.8835393623942746,
"grad_norm": 0.2175220251083374,
"learning_rate": 0.0001,
"loss": 1.9055,
"step": 1358
},
{
"epoch": 0.8841899804814574,
"grad_norm": 0.2790168523788452,
"learning_rate": 0.0001,
"loss": 2.0223,
"step": 1359
},
{
"epoch": 0.8848405985686402,
"grad_norm": 0.22070975601673126,
"learning_rate": 0.0001,
"loss": 2.4071,
"step": 1360
},
{
"epoch": 0.885491216655823,
"grad_norm": 0.22505122423171997,
"learning_rate": 0.0001,
"loss": 2.2988,
"step": 1361
},
{
"epoch": 0.8861418347430059,
"grad_norm": 0.2231319099664688,
"learning_rate": 0.0001,
"loss": 2.0156,
"step": 1362
},
{
"epoch": 0.8867924528301887,
"grad_norm": 0.2921566665172577,
"learning_rate": 0.0001,
"loss": 2.7166,
"step": 1363
},
{
"epoch": 0.8874430709173715,
"grad_norm": 0.19267822802066803,
"learning_rate": 0.0001,
"loss": 2.0485,
"step": 1364
},
{
"epoch": 0.8880936890045543,
"grad_norm": 0.28789597749710083,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 1365
},
{
"epoch": 0.8887443070917371,
"grad_norm": 0.3205803334712982,
"learning_rate": 0.0001,
"loss": 2.5545,
"step": 1366
},
{
"epoch": 0.8893949251789199,
"grad_norm": 0.20888707041740417,
"learning_rate": 0.0001,
"loss": 1.8906,
"step": 1367
},
{
"epoch": 0.8900455432661027,
"grad_norm": 0.18200016021728516,
"learning_rate": 0.0001,
"loss": 1.8483,
"step": 1368
},
{
"epoch": 0.8906961613532857,
"grad_norm": 0.2367328256368637,
"learning_rate": 0.0001,
"loss": 2.3351,
"step": 1369
},
{
"epoch": 0.8913467794404685,
"grad_norm": 0.28111082315444946,
"learning_rate": 0.0001,
"loss": 2.5511,
"step": 1370
},
{
"epoch": 0.8919973975276513,
"grad_norm": 0.19744041562080383,
"learning_rate": 0.0001,
"loss": 1.9521,
"step": 1371
},
{
"epoch": 0.8926480156148341,
"grad_norm": 0.2166965901851654,
"learning_rate": 0.0001,
"loss": 2.1205,
"step": 1372
},
{
"epoch": 0.8932986337020169,
"grad_norm": 0.20931009948253632,
"learning_rate": 0.0001,
"loss": 2.1394,
"step": 1373
},
{
"epoch": 0.8939492517891997,
"grad_norm": 0.2102230191230774,
"learning_rate": 0.0001,
"loss": 1.9695,
"step": 1374
},
{
"epoch": 0.8945998698763825,
"grad_norm": 0.22161559760570526,
"learning_rate": 0.0001,
"loss": 2.4084,
"step": 1375
},
{
"epoch": 0.8952504879635654,
"grad_norm": 0.22104842960834503,
"learning_rate": 0.0001,
"loss": 2.6029,
"step": 1376
},
{
"epoch": 0.8959011060507482,
"grad_norm": 0.2125016152858734,
"learning_rate": 0.0001,
"loss": 2.0576,
"step": 1377
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.2626838684082031,
"learning_rate": 0.0001,
"loss": 2.5907,
"step": 1378
},
{
"epoch": 0.8972023422251139,
"grad_norm": 0.19114330410957336,
"learning_rate": 0.0001,
"loss": 2.2824,
"step": 1379
},
{
"epoch": 0.8978529603122967,
"grad_norm": 0.24731865525245667,
"learning_rate": 0.0001,
"loss": 2.5292,
"step": 1380
},
{
"epoch": 0.8985035783994795,
"grad_norm": 0.23787495493888855,
"learning_rate": 0.0001,
"loss": 2.1433,
"step": 1381
},
{
"epoch": 0.8991541964866623,
"grad_norm": 0.2028874158859253,
"learning_rate": 0.0001,
"loss": 2.2726,
"step": 1382
},
{
"epoch": 0.8998048145738452,
"grad_norm": 0.22940067946910858,
"learning_rate": 0.0001,
"loss": 2.3222,
"step": 1383
},
{
"epoch": 0.900455432661028,
"grad_norm": 0.20267997682094574,
"learning_rate": 0.0001,
"loss": 2.2875,
"step": 1384
},
{
"epoch": 0.9011060507482108,
"grad_norm": 0.21694517135620117,
"learning_rate": 0.0001,
"loss": 2.3674,
"step": 1385
},
{
"epoch": 0.9017566688353936,
"grad_norm": 0.1904231160879135,
"learning_rate": 0.0001,
"loss": 1.996,
"step": 1386
},
{
"epoch": 0.9024072869225764,
"grad_norm": 0.2630701959133148,
"learning_rate": 0.0001,
"loss": 2.4881,
"step": 1387
},
{
"epoch": 0.9030579050097592,
"grad_norm": 0.19993318617343903,
"learning_rate": 0.0001,
"loss": 1.9409,
"step": 1388
},
{
"epoch": 0.903708523096942,
"grad_norm": 0.19389230012893677,
"learning_rate": 0.0001,
"loss": 2.1121,
"step": 1389
},
{
"epoch": 0.904359141184125,
"grad_norm": 0.20352298021316528,
"learning_rate": 0.0001,
"loss": 1.9887,
"step": 1390
},
{
"epoch": 0.9050097592713078,
"grad_norm": 0.17967310547828674,
"learning_rate": 0.0001,
"loss": 1.8068,
"step": 1391
},
{
"epoch": 0.9056603773584906,
"grad_norm": 0.2310938984155655,
"learning_rate": 0.0001,
"loss": 2.2666,
"step": 1392
},
{
"epoch": 0.9063109954456734,
"grad_norm": 0.18979041278362274,
"learning_rate": 0.0001,
"loss": 2.0004,
"step": 1393
},
{
"epoch": 0.9069616135328562,
"grad_norm": 0.26813068985939026,
"learning_rate": 0.0001,
"loss": 2.4142,
"step": 1394
},
{
"epoch": 0.907612231620039,
"grad_norm": 0.23549699783325195,
"learning_rate": 0.0001,
"loss": 2.3059,
"step": 1395
},
{
"epoch": 0.9082628497072218,
"grad_norm": 0.2435377985239029,
"learning_rate": 0.0001,
"loss": 2.1919,
"step": 1396
},
{
"epoch": 0.9089134677944047,
"grad_norm": 0.21723680198192596,
"learning_rate": 0.0001,
"loss": 2.2244,
"step": 1397
},
{
"epoch": 0.9095640858815875,
"grad_norm": 0.20665475726127625,
"learning_rate": 0.0001,
"loss": 2.1907,
"step": 1398
},
{
"epoch": 0.9102147039687704,
"grad_norm": 0.26172783970832825,
"learning_rate": 0.0001,
"loss": 2.5632,
"step": 1399
},
{
"epoch": 0.9108653220559532,
"grad_norm": 0.22065763175487518,
"learning_rate": 0.0001,
"loss": 2.287,
"step": 1400
},
{
"epoch": 0.911515940143136,
"grad_norm": 0.260623574256897,
"learning_rate": 0.0001,
"loss": 2.7247,
"step": 1401
},
{
"epoch": 0.9121665582303188,
"grad_norm": 0.1967797726392746,
"learning_rate": 0.0001,
"loss": 2.3431,
"step": 1402
},
{
"epoch": 0.9128171763175016,
"grad_norm": 0.19779254496097565,
"learning_rate": 0.0001,
"loss": 1.8389,
"step": 1403
},
{
"epoch": 0.9134677944046844,
"grad_norm": 0.20970992743968964,
"learning_rate": 0.0001,
"loss": 2.1884,
"step": 1404
},
{
"epoch": 0.9141184124918673,
"grad_norm": 0.22229008376598358,
"learning_rate": 0.0001,
"loss": 2.2673,
"step": 1405
},
{
"epoch": 0.9147690305790501,
"grad_norm": 0.2208055853843689,
"learning_rate": 0.0001,
"loss": 2.1967,
"step": 1406
},
{
"epoch": 0.9154196486662329,
"grad_norm": 0.2209876924753189,
"learning_rate": 0.0001,
"loss": 2.2027,
"step": 1407
},
{
"epoch": 0.9160702667534157,
"grad_norm": 0.19158391654491425,
"learning_rate": 0.0001,
"loss": 1.9069,
"step": 1408
},
{
"epoch": 0.9167208848405985,
"grad_norm": 0.2156110256910324,
"learning_rate": 0.0001,
"loss": 2.2712,
"step": 1409
},
{
"epoch": 0.9173715029277814,
"grad_norm": 0.2610962390899658,
"learning_rate": 0.0001,
"loss": 1.8294,
"step": 1410
},
{
"epoch": 0.9180221210149642,
"grad_norm": 0.18197974562644958,
"learning_rate": 0.0001,
"loss": 1.9715,
"step": 1411
},
{
"epoch": 0.9186727391021471,
"grad_norm": 0.19082801043987274,
"learning_rate": 0.0001,
"loss": 2.1091,
"step": 1412
},
{
"epoch": 0.9193233571893299,
"grad_norm": 0.26832160353660583,
"learning_rate": 0.0001,
"loss": 2.7021,
"step": 1413
},
{
"epoch": 0.9199739752765127,
"grad_norm": 0.3070698082447052,
"learning_rate": 0.0001,
"loss": 2.4547,
"step": 1414
},
{
"epoch": 0.9206245933636955,
"grad_norm": 0.25139206647872925,
"learning_rate": 0.0001,
"loss": 2.5873,
"step": 1415
},
{
"epoch": 0.9212752114508783,
"grad_norm": 0.2131306529045105,
"learning_rate": 0.0001,
"loss": 2.3841,
"step": 1416
},
{
"epoch": 0.9219258295380611,
"grad_norm": 0.24531540274620056,
"learning_rate": 0.0001,
"loss": 1.9666,
"step": 1417
},
{
"epoch": 0.9225764476252439,
"grad_norm": 0.1986437737941742,
"learning_rate": 0.0001,
"loss": 1.9241,
"step": 1418
},
{
"epoch": 0.9232270657124269,
"grad_norm": 0.23614904284477234,
"learning_rate": 0.0001,
"loss": 2.5824,
"step": 1419
},
{
"epoch": 0.9238776837996097,
"grad_norm": 0.2782133221626282,
"learning_rate": 0.0001,
"loss": 2.1812,
"step": 1420
},
{
"epoch": 0.9245283018867925,
"grad_norm": 0.2232246845960617,
"learning_rate": 0.0001,
"loss": 2.3204,
"step": 1421
},
{
"epoch": 0.9251789199739753,
"grad_norm": 0.22002846002578735,
"learning_rate": 0.0001,
"loss": 1.8228,
"step": 1422
},
{
"epoch": 0.9258295380611581,
"grad_norm": 0.30900144577026367,
"learning_rate": 0.0001,
"loss": 2.4824,
"step": 1423
},
{
"epoch": 0.9264801561483409,
"grad_norm": 0.262989342212677,
"learning_rate": 0.0001,
"loss": 2.8719,
"step": 1424
},
{
"epoch": 0.9271307742355237,
"grad_norm": 0.5406531095504761,
"learning_rate": 0.0001,
"loss": 2.6984,
"step": 1425
},
{
"epoch": 0.9277813923227066,
"grad_norm": 0.2415890246629715,
"learning_rate": 0.0001,
"loss": 2.2543,
"step": 1426
},
{
"epoch": 0.9284320104098894,
"grad_norm": 0.21261392533779144,
"learning_rate": 0.0001,
"loss": 1.9761,
"step": 1427
},
{
"epoch": 0.9290826284970722,
"grad_norm": 0.23223569989204407,
"learning_rate": 0.0001,
"loss": 1.821,
"step": 1428
},
{
"epoch": 0.929733246584255,
"grad_norm": 0.2846924960613251,
"learning_rate": 0.0001,
"loss": 1.9886,
"step": 1429
},
{
"epoch": 0.9303838646714379,
"grad_norm": 0.2527627646923065,
"learning_rate": 0.0001,
"loss": 2.373,
"step": 1430
},
{
"epoch": 0.9310344827586207,
"grad_norm": 0.19917793571949005,
"learning_rate": 0.0001,
"loss": 2.0111,
"step": 1431
},
{
"epoch": 0.9316851008458035,
"grad_norm": 0.19021449983119965,
"learning_rate": 0.0001,
"loss": 2.0373,
"step": 1432
},
{
"epoch": 0.9323357189329864,
"grad_norm": 0.24929922819137573,
"learning_rate": 0.0001,
"loss": 2.3885,
"step": 1433
},
{
"epoch": 0.9329863370201692,
"grad_norm": 0.2533571124076843,
"learning_rate": 0.0001,
"loss": 2.544,
"step": 1434
},
{
"epoch": 0.933636955107352,
"grad_norm": 0.23931783437728882,
"learning_rate": 0.0001,
"loss": 2.418,
"step": 1435
},
{
"epoch": 0.9342875731945348,
"grad_norm": 0.30167070031166077,
"learning_rate": 0.0001,
"loss": 2.6513,
"step": 1436
},
{
"epoch": 0.9349381912817176,
"grad_norm": 0.1971869319677353,
"learning_rate": 0.0001,
"loss": 2.4016,
"step": 1437
},
{
"epoch": 0.9355888093689004,
"grad_norm": 0.21331265568733215,
"learning_rate": 0.0001,
"loss": 2.1524,
"step": 1438
},
{
"epoch": 0.9362394274560832,
"grad_norm": 0.26298433542251587,
"learning_rate": 0.0001,
"loss": 2.9442,
"step": 1439
},
{
"epoch": 0.936890045543266,
"grad_norm": 0.245792955160141,
"learning_rate": 0.0001,
"loss": 2.2055,
"step": 1440
},
{
"epoch": 0.937540663630449,
"grad_norm": 0.23703397810459137,
"learning_rate": 0.0001,
"loss": 2.5616,
"step": 1441
},
{
"epoch": 0.9381912817176318,
"grad_norm": 0.18641355633735657,
"learning_rate": 0.0001,
"loss": 1.8982,
"step": 1442
},
{
"epoch": 0.9388418998048146,
"grad_norm": 0.3551875650882721,
"learning_rate": 0.0001,
"loss": 2.7802,
"step": 1443
},
{
"epoch": 0.9394925178919974,
"grad_norm": 0.2278834879398346,
"learning_rate": 0.0001,
"loss": 2.175,
"step": 1444
},
{
"epoch": 0.9401431359791802,
"grad_norm": 0.26398956775665283,
"learning_rate": 0.0001,
"loss": 2.643,
"step": 1445
},
{
"epoch": 0.940793754066363,
"grad_norm": 0.31316065788269043,
"learning_rate": 0.0001,
"loss": 2.5662,
"step": 1446
},
{
"epoch": 0.9414443721535458,
"grad_norm": 0.22769761085510254,
"learning_rate": 0.0001,
"loss": 2.8677,
"step": 1447
},
{
"epoch": 0.9420949902407287,
"grad_norm": 0.2069929838180542,
"learning_rate": 0.0001,
"loss": 2.4393,
"step": 1448
},
{
"epoch": 0.9427456083279115,
"grad_norm": 0.23500226438045502,
"learning_rate": 0.0001,
"loss": 2.0914,
"step": 1449
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.2312425971031189,
"learning_rate": 0.0001,
"loss": 2.1085,
"step": 1450
},
{
"epoch": 0.9440468445022772,
"grad_norm": 0.20859290659427643,
"learning_rate": 0.0001,
"loss": 2.0653,
"step": 1451
},
{
"epoch": 0.94469746258946,
"grad_norm": 0.23336270451545715,
"learning_rate": 0.0001,
"loss": 2.1047,
"step": 1452
},
{
"epoch": 0.9453480806766428,
"grad_norm": 0.2613270580768585,
"learning_rate": 0.0001,
"loss": 2.3179,
"step": 1453
},
{
"epoch": 0.9459986987638256,
"grad_norm": 0.2182740867137909,
"learning_rate": 0.0001,
"loss": 2.0625,
"step": 1454
},
{
"epoch": 0.9466493168510085,
"grad_norm": 0.28436079621315,
"learning_rate": 0.0001,
"loss": 1.8766,
"step": 1455
},
{
"epoch": 0.9472999349381913,
"grad_norm": 0.1998225450515747,
"learning_rate": 0.0001,
"loss": 2.3157,
"step": 1456
},
{
"epoch": 0.9479505530253741,
"grad_norm": 0.19695498049259186,
"learning_rate": 0.0001,
"loss": 1.7501,
"step": 1457
},
{
"epoch": 0.9486011711125569,
"grad_norm": 0.1972542405128479,
"learning_rate": 0.0001,
"loss": 1.956,
"step": 1458
},
{
"epoch": 0.9492517891997397,
"grad_norm": 0.18410329520702362,
"learning_rate": 0.0001,
"loss": 1.8403,
"step": 1459
},
{
"epoch": 0.9499024072869225,
"grad_norm": 0.3675645887851715,
"learning_rate": 0.0001,
"loss": 2.9161,
"step": 1460
},
{
"epoch": 0.9505530253741054,
"grad_norm": 0.2620394229888916,
"learning_rate": 0.0001,
"loss": 2.4318,
"step": 1461
},
{
"epoch": 0.9512036434612883,
"grad_norm": 0.28973767161369324,
"learning_rate": 0.0001,
"loss": 2.0047,
"step": 1462
},
{
"epoch": 0.9518542615484711,
"grad_norm": 0.31598249077796936,
"learning_rate": 0.0001,
"loss": 2.4517,
"step": 1463
},
{
"epoch": 0.9525048796356539,
"grad_norm": 0.18546514213085175,
"learning_rate": 0.0001,
"loss": 1.8551,
"step": 1464
},
{
"epoch": 0.9531554977228367,
"grad_norm": 0.32123416662216187,
"learning_rate": 0.0001,
"loss": 2.7277,
"step": 1465
},
{
"epoch": 0.9538061158100195,
"grad_norm": 0.25180497765541077,
"learning_rate": 0.0001,
"loss": 1.7946,
"step": 1466
},
{
"epoch": 0.9544567338972023,
"grad_norm": 0.24950966238975525,
"learning_rate": 0.0001,
"loss": 2.0796,
"step": 1467
},
{
"epoch": 0.9551073519843851,
"grad_norm": 0.20496372878551483,
"learning_rate": 0.0001,
"loss": 2.0713,
"step": 1468
},
{
"epoch": 0.955757970071568,
"grad_norm": 0.20856817066669464,
"learning_rate": 0.0001,
"loss": 2.1812,
"step": 1469
},
{
"epoch": 0.9564085881587508,
"grad_norm": 0.26053234934806824,
"learning_rate": 0.0001,
"loss": 2.3234,
"step": 1470
},
{
"epoch": 0.9570592062459337,
"grad_norm": 0.3086039125919342,
"learning_rate": 0.0001,
"loss": 2.3745,
"step": 1471
},
{
"epoch": 0.9577098243331165,
"grad_norm": 0.19647593796253204,
"learning_rate": 0.0001,
"loss": 1.8883,
"step": 1472
},
{
"epoch": 0.9583604424202993,
"grad_norm": 0.20327430963516235,
"learning_rate": 0.0001,
"loss": 2.125,
"step": 1473
},
{
"epoch": 0.9590110605074821,
"grad_norm": 0.22550363838672638,
"learning_rate": 0.0001,
"loss": 2.1609,
"step": 1474
},
{
"epoch": 0.9596616785946649,
"grad_norm": 0.2369288206100464,
"learning_rate": 0.0001,
"loss": 1.9352,
"step": 1475
},
{
"epoch": 0.9603122966818478,
"grad_norm": 0.21195881068706512,
"learning_rate": 0.0001,
"loss": 2.0275,
"step": 1476
},
{
"epoch": 0.9609629147690306,
"grad_norm": 0.17060896754264832,
"learning_rate": 0.0001,
"loss": 1.9566,
"step": 1477
},
{
"epoch": 0.9616135328562134,
"grad_norm": 0.23335829377174377,
"learning_rate": 0.0001,
"loss": 2.296,
"step": 1478
},
{
"epoch": 0.9622641509433962,
"grad_norm": 0.34170275926589966,
"learning_rate": 0.0001,
"loss": 2.0079,
"step": 1479
},
{
"epoch": 0.962914769030579,
"grad_norm": 0.2187998741865158,
"learning_rate": 0.0001,
"loss": 2.0203,
"step": 1480
},
{
"epoch": 0.9635653871177619,
"grad_norm": 0.1877596378326416,
"learning_rate": 0.0001,
"loss": 1.9496,
"step": 1481
},
{
"epoch": 0.9642160052049447,
"grad_norm": 0.18515220284461975,
"learning_rate": 0.0001,
"loss": 2.0025,
"step": 1482
},
{
"epoch": 0.9648666232921275,
"grad_norm": 0.21251696348190308,
"learning_rate": 0.0001,
"loss": 1.8843,
"step": 1483
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.19280041754245758,
"learning_rate": 0.0001,
"loss": 2.0726,
"step": 1484
},
{
"epoch": 0.9661678594664932,
"grad_norm": 0.1977832317352295,
"learning_rate": 0.0001,
"loss": 2.0546,
"step": 1485
},
{
"epoch": 0.966818477553676,
"grad_norm": 0.19019471108913422,
"learning_rate": 0.0001,
"loss": 1.9825,
"step": 1486
},
{
"epoch": 0.9674690956408588,
"grad_norm": 0.20381596684455872,
"learning_rate": 0.0001,
"loss": 2.3339,
"step": 1487
},
{
"epoch": 0.9681197137280416,
"grad_norm": 0.1899532973766327,
"learning_rate": 0.0001,
"loss": 2.2962,
"step": 1488
},
{
"epoch": 0.9687703318152244,
"grad_norm": 0.20524102449417114,
"learning_rate": 0.0001,
"loss": 2.0874,
"step": 1489
},
{
"epoch": 0.9694209499024072,
"grad_norm": 0.179798424243927,
"learning_rate": 0.0001,
"loss": 1.8875,
"step": 1490
},
{
"epoch": 0.9700715679895902,
"grad_norm": 0.19358840584754944,
"learning_rate": 0.0001,
"loss": 2.1539,
"step": 1491
},
{
"epoch": 0.970722186076773,
"grad_norm": 0.2686682343482971,
"learning_rate": 0.0001,
"loss": 2.3412,
"step": 1492
},
{
"epoch": 0.9713728041639558,
"grad_norm": 0.2146061509847641,
"learning_rate": 0.0001,
"loss": 2.476,
"step": 1493
},
{
"epoch": 0.9720234222511386,
"grad_norm": 0.26737329363822937,
"learning_rate": 0.0001,
"loss": 2.8003,
"step": 1494
},
{
"epoch": 0.9726740403383214,
"grad_norm": 0.23344694077968597,
"learning_rate": 0.0001,
"loss": 2.1174,
"step": 1495
},
{
"epoch": 0.9733246584255042,
"grad_norm": 0.1991250365972519,
"learning_rate": 0.0001,
"loss": 2.5734,
"step": 1496
},
{
"epoch": 0.973975276512687,
"grad_norm": 0.21246576309204102,
"learning_rate": 0.0001,
"loss": 2.5597,
"step": 1497
},
{
"epoch": 0.9746258945998699,
"grad_norm": 0.1873084306716919,
"learning_rate": 0.0001,
"loss": 1.9547,
"step": 1498
},
{
"epoch": 0.9752765126870527,
"grad_norm": 0.17600129544734955,
"learning_rate": 0.0001,
"loss": 1.7255,
"step": 1499
},
{
"epoch": 0.9759271307742355,
"grad_norm": 0.19860287010669708,
"learning_rate": 0.0001,
"loss": 2.5043,
"step": 1500
},
{
"epoch": 0.9765777488614183,
"grad_norm": 0.1887977123260498,
"learning_rate": 0.0001,
"loss": 2.091,
"step": 1501
},
{
"epoch": 0.9772283669486012,
"grad_norm": 0.1981416791677475,
"learning_rate": 0.0001,
"loss": 1.968,
"step": 1502
},
{
"epoch": 0.977878985035784,
"grad_norm": 0.22598034143447876,
"learning_rate": 0.0001,
"loss": 2.2569,
"step": 1503
},
{
"epoch": 0.9785296031229668,
"grad_norm": 0.18924662470817566,
"learning_rate": 0.0001,
"loss": 2.3823,
"step": 1504
},
{
"epoch": 0.9791802212101497,
"grad_norm": 0.2178531438112259,
"learning_rate": 0.0001,
"loss": 2.0824,
"step": 1505
},
{
"epoch": 0.9798308392973325,
"grad_norm": 0.2125057578086853,
"learning_rate": 0.0001,
"loss": 2.196,
"step": 1506
},
{
"epoch": 0.9804814573845153,
"grad_norm": 0.19958944618701935,
"learning_rate": 0.0001,
"loss": 1.8752,
"step": 1507
},
{
"epoch": 0.9811320754716981,
"grad_norm": 0.23179121315479279,
"learning_rate": 0.0001,
"loss": 2.0539,
"step": 1508
},
{
"epoch": 0.9817826935588809,
"grad_norm": 0.19006481766700745,
"learning_rate": 0.0001,
"loss": 2.0125,
"step": 1509
},
{
"epoch": 0.9824333116460637,
"grad_norm": 0.1952325403690338,
"learning_rate": 0.0001,
"loss": 2.1829,
"step": 1510
},
{
"epoch": 0.9830839297332465,
"grad_norm": 0.24362123012542725,
"learning_rate": 0.0001,
"loss": 2.4628,
"step": 1511
},
{
"epoch": 0.9837345478204295,
"grad_norm": 0.20148973166942596,
"learning_rate": 0.0001,
"loss": 1.9869,
"step": 1512
},
{
"epoch": 0.9843851659076123,
"grad_norm": 0.19783656299114227,
"learning_rate": 0.0001,
"loss": 2.1447,
"step": 1513
},
{
"epoch": 0.9850357839947951,
"grad_norm": 0.2120031863451004,
"learning_rate": 0.0001,
"loss": 2.1149,
"step": 1514
},
{
"epoch": 0.9856864020819779,
"grad_norm": 0.2673274278640747,
"learning_rate": 0.0001,
"loss": 2.3755,
"step": 1515
},
{
"epoch": 0.9863370201691607,
"grad_norm": 0.31493106484413147,
"learning_rate": 0.0001,
"loss": 2.8462,
"step": 1516
},
{
"epoch": 0.9869876382563435,
"grad_norm": 0.24251258373260498,
"learning_rate": 0.0001,
"loss": 2.6499,
"step": 1517
},
{
"epoch": 0.9876382563435263,
"grad_norm": 0.19818106293678284,
"learning_rate": 0.0001,
"loss": 2.1229,
"step": 1518
},
{
"epoch": 0.9882888744307091,
"grad_norm": 0.2608949542045593,
"learning_rate": 0.0001,
"loss": 2.7848,
"step": 1519
},
{
"epoch": 0.988939492517892,
"grad_norm": 0.19214370846748352,
"learning_rate": 0.0001,
"loss": 2.0514,
"step": 1520
},
{
"epoch": 0.9895901106050748,
"grad_norm": 0.21454864740371704,
"learning_rate": 0.0001,
"loss": 1.8879,
"step": 1521
},
{
"epoch": 0.9902407286922577,
"grad_norm": 0.22206801176071167,
"learning_rate": 0.0001,
"loss": 2.0008,
"step": 1522
},
{
"epoch": 0.9908913467794405,
"grad_norm": 0.19270485639572144,
"learning_rate": 0.0001,
"loss": 1.9491,
"step": 1523
},
{
"epoch": 0.9915419648666233,
"grad_norm": 0.27471333742141724,
"learning_rate": 0.0001,
"loss": 2.4914,
"step": 1524
},
{
"epoch": 0.9921925829538061,
"grad_norm": 0.2767917513847351,
"learning_rate": 0.0001,
"loss": 2.3733,
"step": 1525
},
{
"epoch": 0.9928432010409889,
"grad_norm": 0.222362220287323,
"learning_rate": 0.0001,
"loss": 2.1563,
"step": 1526
},
{
"epoch": 0.9934938191281718,
"grad_norm": 0.2520142197608948,
"learning_rate": 0.0001,
"loss": 2.2877,
"step": 1527
},
{
"epoch": 0.9941444372153546,
"grad_norm": 0.20014792680740356,
"learning_rate": 0.0001,
"loss": 2.087,
"step": 1528
},
{
"epoch": 0.9947950553025374,
"grad_norm": 0.18027350306510925,
"learning_rate": 0.0001,
"loss": 1.9049,
"step": 1529
},
{
"epoch": 0.9954456733897202,
"grad_norm": 0.20437590777873993,
"learning_rate": 0.0001,
"loss": 1.9805,
"step": 1530
},
{
"epoch": 0.996096291476903,
"grad_norm": 0.38628190755844116,
"learning_rate": 0.0001,
"loss": 2.5385,
"step": 1531
},
{
"epoch": 0.9967469095640858,
"grad_norm": 0.24987295269966125,
"learning_rate": 0.0001,
"loss": 2.0762,
"step": 1532
},
{
"epoch": 0.9973975276512687,
"grad_norm": 0.2631097733974457,
"learning_rate": 0.0001,
"loss": 2.1693,
"step": 1533
},
{
"epoch": 0.9980481457384516,
"grad_norm": 0.21323037147521973,
"learning_rate": 0.0001,
"loss": 1.8547,
"step": 1534
},
{
"epoch": 0.9986987638256344,
"grad_norm": 0.19627395272254944,
"learning_rate": 0.0001,
"loss": 1.9524,
"step": 1535
},
{
"epoch": 0.9993493819128172,
"grad_norm": 0.23723964393138885,
"learning_rate": 0.0001,
"loss": 2.2301,
"step": 1536
},
{
"epoch": 1.0,
"grad_norm": 0.2651236355304718,
"learning_rate": 0.0001,
"loss": 2.3068,
"step": 1537
},
{
"epoch": 1.0,
"step": 1537,
"total_flos": 2.3185853705323807e+18,
"train_loss": 2.2698031654587485,
"train_runtime": 18789.3101,
"train_samples_per_second": 0.327,
"train_steps_per_second": 0.082
}
],
"logging_steps": 1,
"max_steps": 1537,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.3185853705323807e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}