SQLFlow-Retrival-0.6B / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4017,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007472445357743321,
"grad_norm": 7.656607564918146,
"learning_rate": 5.999999770634896e-06,
"loss": 0.880859375,
"memory(GiB)": 4.97,
"step": 1,
"train_speed(iter/s)": 0.036844
},
{
"epoch": 0.0037362226788716607,
"grad_norm": 1.3240812663129038,
"learning_rate": 5.999994265874156e-06,
"loss": 0.6390380859375,
"memory(GiB)": 5.26,
"step": 5,
"train_speed(iter/s)": 0.138065
},
{
"epoch": 0.007472445357743321,
"grad_norm": 0.7284755993919946,
"learning_rate": 5.999977063518543e-06,
"loss": 0.5041015625,
"memory(GiB)": 5.26,
"step": 10,
"train_speed(iter/s)": 0.213363
},
{
"epoch": 0.011208668036614982,
"grad_norm": 0.6580943903426335,
"learning_rate": 5.999948392998923e-06,
"loss": 0.40029296875,
"memory(GiB)": 5.26,
"step": 15,
"train_speed(iter/s)": 0.261004
},
{
"epoch": 0.014944890715486643,
"grad_norm": 0.7857583426949194,
"learning_rate": 5.999908254424895e-06,
"loss": 0.33583984375,
"memory(GiB)": 6.59,
"step": 20,
"train_speed(iter/s)": 0.293459
},
{
"epoch": 0.018681113394358302,
"grad_norm": 0.6568799760311678,
"learning_rate": 5.999856647949899e-06,
"loss": 0.29228515625,
"memory(GiB)": 6.59,
"step": 25,
"train_speed(iter/s)": 0.318359
},
{
"epoch": 0.022417336073229963,
"grad_norm": 0.6029155858817358,
"learning_rate": 5.999793573771213e-06,
"loss": 0.316943359375,
"memory(GiB)": 6.59,
"step": 30,
"train_speed(iter/s)": 0.337313
},
{
"epoch": 0.026153558752101624,
"grad_norm": 0.5753306542956114,
"learning_rate": 5.999719032129956e-06,
"loss": 0.2943359375,
"memory(GiB)": 6.59,
"step": 35,
"train_speed(iter/s)": 0.348738
},
{
"epoch": 0.029889781430973286,
"grad_norm": 0.4408689781290305,
"learning_rate": 5.999633023311079e-06,
"loss": 0.2607421875,
"memory(GiB)": 6.59,
"step": 40,
"train_speed(iter/s)": 0.361231
},
{
"epoch": 0.03362600410984495,
"grad_norm": 0.48969938792644213,
"learning_rate": 5.999535547643375e-06,
"loss": 0.274072265625,
"memory(GiB)": 6.59,
"step": 45,
"train_speed(iter/s)": 0.371707
},
{
"epoch": 0.037362226788716604,
"grad_norm": 0.44261950794528854,
"learning_rate": 5.999426605499469e-06,
"loss": 0.26669921875,
"memory(GiB)": 6.59,
"step": 50,
"train_speed(iter/s)": 0.379995
},
{
"epoch": 0.04109844946758827,
"grad_norm": 0.598637540279772,
"learning_rate": 5.999306197295818e-06,
"loss": 0.231982421875,
"memory(GiB)": 6.59,
"step": 55,
"train_speed(iter/s)": 0.387849
},
{
"epoch": 0.04483467214645993,
"grad_norm": 0.590172951597349,
"learning_rate": 5.999174323492712e-06,
"loss": 0.2291015625,
"memory(GiB)": 6.59,
"step": 60,
"train_speed(iter/s)": 0.394021
},
{
"epoch": 0.04857089482533159,
"grad_norm": 0.3946331437032985,
"learning_rate": 5.999030984594274e-06,
"loss": 0.2292236328125,
"memory(GiB)": 7.12,
"step": 65,
"train_speed(iter/s)": 0.398312
},
{
"epoch": 0.05230711750420325,
"grad_norm": 0.47975474179770955,
"learning_rate": 5.998876181148451e-06,
"loss": 0.244677734375,
"memory(GiB)": 7.12,
"step": 70,
"train_speed(iter/s)": 0.403159
},
{
"epoch": 0.05604334018307491,
"grad_norm": 0.42828328321417347,
"learning_rate": 5.99870991374702e-06,
"loss": 0.241357421875,
"memory(GiB)": 7.12,
"step": 75,
"train_speed(iter/s)": 0.40778
},
{
"epoch": 0.05977956286194657,
"grad_norm": 0.42785803136464096,
"learning_rate": 5.9985321830255785e-06,
"loss": 0.19462890625,
"memory(GiB)": 7.12,
"step": 80,
"train_speed(iter/s)": 0.411773
},
{
"epoch": 0.06351578554081823,
"grad_norm": 0.508984996055907,
"learning_rate": 5.998342989663546e-06,
"loss": 0.2152587890625,
"memory(GiB)": 7.12,
"step": 85,
"train_speed(iter/s)": 0.415122
},
{
"epoch": 0.0672520082196899,
"grad_norm": 0.40314104128835676,
"learning_rate": 5.998142334384162e-06,
"loss": 0.2130859375,
"memory(GiB)": 7.12,
"step": 90,
"train_speed(iter/s)": 0.417064
},
{
"epoch": 0.07098823089856156,
"grad_norm": 0.4958145558390914,
"learning_rate": 5.997930217954482e-06,
"loss": 0.20390625,
"memory(GiB)": 7.12,
"step": 95,
"train_speed(iter/s)": 0.419957
},
{
"epoch": 0.07472445357743321,
"grad_norm": 0.41222740097614996,
"learning_rate": 5.997706641185376e-06,
"loss": 0.2318359375,
"memory(GiB)": 7.68,
"step": 100,
"train_speed(iter/s)": 0.422692
},
{
"epoch": 0.07846067625630487,
"grad_norm": 0.3568824010450547,
"learning_rate": 5.997471604931518e-06,
"loss": 0.21181640625,
"memory(GiB)": 7.68,
"step": 105,
"train_speed(iter/s)": 0.425586
},
{
"epoch": 0.08219689893517654,
"grad_norm": 0.5279562949874639,
"learning_rate": 5.997225110091396e-06,
"loss": 0.2095947265625,
"memory(GiB)": 7.68,
"step": 110,
"train_speed(iter/s)": 0.428419
},
{
"epoch": 0.0859331216140482,
"grad_norm": 0.4919839298671231,
"learning_rate": 5.996967157607298e-06,
"loss": 0.187939453125,
"memory(GiB)": 7.68,
"step": 115,
"train_speed(iter/s)": 0.430818
},
{
"epoch": 0.08966934429291985,
"grad_norm": 0.3706866470661083,
"learning_rate": 5.99669774846531e-06,
"loss": 0.2244140625,
"memory(GiB)": 7.68,
"step": 120,
"train_speed(iter/s)": 0.432015
},
{
"epoch": 0.09340556697179152,
"grad_norm": 0.39636987044245997,
"learning_rate": 5.9964168836953194e-06,
"loss": 0.206689453125,
"memory(GiB)": 7.68,
"step": 125,
"train_speed(iter/s)": 0.434132
},
{
"epoch": 0.09714178965066318,
"grad_norm": 0.4441200958244795,
"learning_rate": 5.996124564371e-06,
"loss": 0.17958984375,
"memory(GiB)": 7.68,
"step": 130,
"train_speed(iter/s)": 0.435878
},
{
"epoch": 0.10087801232953485,
"grad_norm": 0.5703220339704642,
"learning_rate": 5.995820791609815e-06,
"loss": 0.1775390625,
"memory(GiB)": 7.68,
"step": 135,
"train_speed(iter/s)": 0.437848
},
{
"epoch": 0.1046142350084065,
"grad_norm": 0.4384590937574754,
"learning_rate": 5.995505566573013e-06,
"loss": 0.166064453125,
"memory(GiB)": 7.68,
"step": 140,
"train_speed(iter/s)": 0.438804
},
{
"epoch": 0.10835045768727816,
"grad_norm": 0.39708135180108495,
"learning_rate": 5.995178890465622e-06,
"loss": 0.1685302734375,
"memory(GiB)": 7.68,
"step": 145,
"train_speed(iter/s)": 0.440584
},
{
"epoch": 0.11208668036614983,
"grad_norm": 0.4525405723559605,
"learning_rate": 5.99484076453644e-06,
"loss": 0.19501953125,
"memory(GiB)": 7.68,
"step": 150,
"train_speed(iter/s)": 0.441918
},
{
"epoch": 0.11582290304502148,
"grad_norm": 0.285652037586189,
"learning_rate": 5.99449119007804e-06,
"loss": 0.1964111328125,
"memory(GiB)": 7.68,
"step": 155,
"train_speed(iter/s)": 0.442742
},
{
"epoch": 0.11955912572389314,
"grad_norm": 0.37436551218621555,
"learning_rate": 5.994130168426758e-06,
"loss": 0.17265625,
"memory(GiB)": 7.68,
"step": 160,
"train_speed(iter/s)": 0.444294
},
{
"epoch": 0.1232953484027648,
"grad_norm": 0.4319611112269015,
"learning_rate": 5.993757700962691e-06,
"loss": 0.1605712890625,
"memory(GiB)": 7.68,
"step": 165,
"train_speed(iter/s)": 0.445095
},
{
"epoch": 0.12703157108163646,
"grad_norm": 0.4679153709762584,
"learning_rate": 5.993373789109686e-06,
"loss": 0.165673828125,
"memory(GiB)": 7.68,
"step": 170,
"train_speed(iter/s)": 0.446127
},
{
"epoch": 0.13076779376050812,
"grad_norm": 0.371562107209469,
"learning_rate": 5.992978434335345e-06,
"loss": 0.2007080078125,
"memory(GiB)": 7.68,
"step": 175,
"train_speed(iter/s)": 0.447213
},
{
"epoch": 0.1345040164393798,
"grad_norm": 0.41362103389091964,
"learning_rate": 5.992571638151009e-06,
"loss": 0.189794921875,
"memory(GiB)": 7.68,
"step": 180,
"train_speed(iter/s)": 0.447752
},
{
"epoch": 0.13824023911825145,
"grad_norm": 0.44521680263908975,
"learning_rate": 5.992153402111759e-06,
"loss": 0.1886474609375,
"memory(GiB)": 7.68,
"step": 185,
"train_speed(iter/s)": 0.448523
},
{
"epoch": 0.14197646179712312,
"grad_norm": 0.3574382830191666,
"learning_rate": 5.991723727816408e-06,
"loss": 0.2037109375,
"memory(GiB)": 7.68,
"step": 190,
"train_speed(iter/s)": 0.449759
},
{
"epoch": 0.14571268447599478,
"grad_norm": 0.384417458292917,
"learning_rate": 5.991282616907493e-06,
"loss": 0.182666015625,
"memory(GiB)": 7.68,
"step": 195,
"train_speed(iter/s)": 0.450424
},
{
"epoch": 0.14944890715486642,
"grad_norm": 0.30564363786555343,
"learning_rate": 5.990830071071269e-06,
"loss": 0.1610107421875,
"memory(GiB)": 7.68,
"step": 200,
"train_speed(iter/s)": 0.45118
},
{
"epoch": 0.15318512983373808,
"grad_norm": 0.34594889167069637,
"learning_rate": 5.990366092037709e-06,
"loss": 0.1712890625,
"memory(GiB)": 7.68,
"step": 205,
"train_speed(iter/s)": 0.451796
},
{
"epoch": 0.15692135251260975,
"grad_norm": 0.26609760325798565,
"learning_rate": 5.9898906815804865e-06,
"loss": 0.1716552734375,
"memory(GiB)": 8.72,
"step": 210,
"train_speed(iter/s)": 0.451737
},
{
"epoch": 0.1606575751914814,
"grad_norm": 0.4326902726320289,
"learning_rate": 5.989403841516979e-06,
"loss": 0.1868408203125,
"memory(GiB)": 9.45,
"step": 215,
"train_speed(iter/s)": 0.452203
},
{
"epoch": 0.16439379787035308,
"grad_norm": 0.31305519468747833,
"learning_rate": 5.9889055737082535e-06,
"loss": 0.1808837890625,
"memory(GiB)": 9.45,
"step": 220,
"train_speed(iter/s)": 0.452465
},
{
"epoch": 0.16813002054922474,
"grad_norm": 0.337929954898332,
"learning_rate": 5.988395880059065e-06,
"loss": 0.1795166015625,
"memory(GiB)": 9.45,
"step": 225,
"train_speed(iter/s)": 0.452759
},
{
"epoch": 0.1718662432280964,
"grad_norm": 0.39047122531072104,
"learning_rate": 5.987874762517843e-06,
"loss": 0.169384765625,
"memory(GiB)": 9.45,
"step": 230,
"train_speed(iter/s)": 0.453624
},
{
"epoch": 0.17560246590696804,
"grad_norm": 0.29442955032080625,
"learning_rate": 5.987342223076692e-06,
"loss": 0.15751953125,
"memory(GiB)": 9.45,
"step": 235,
"train_speed(iter/s)": 0.453993
},
{
"epoch": 0.1793386885858397,
"grad_norm": 0.4050248335175831,
"learning_rate": 5.986798263771375e-06,
"loss": 0.1673095703125,
"memory(GiB)": 9.45,
"step": 240,
"train_speed(iter/s)": 0.454589
},
{
"epoch": 0.18307491126471137,
"grad_norm": 0.39189087307596043,
"learning_rate": 5.9862428866813155e-06,
"loss": 0.17457275390625,
"memory(GiB)": 9.45,
"step": 245,
"train_speed(iter/s)": 0.455097
},
{
"epoch": 0.18681113394358304,
"grad_norm": 0.2995268823777092,
"learning_rate": 5.985676093929579e-06,
"loss": 0.1733154296875,
"memory(GiB)": 9.45,
"step": 250,
"train_speed(iter/s)": 0.455559
},
{
"epoch": 0.1905473566224547,
"grad_norm": 0.35042188317088824,
"learning_rate": 5.985097887682876e-06,
"loss": 0.18154296875,
"memory(GiB)": 9.45,
"step": 255,
"train_speed(iter/s)": 0.456072
},
{
"epoch": 0.19428357930132636,
"grad_norm": 0.3402952343617486,
"learning_rate": 5.984508270151542e-06,
"loss": 0.1767578125,
"memory(GiB)": 9.45,
"step": 260,
"train_speed(iter/s)": 0.456723
},
{
"epoch": 0.19801980198019803,
"grad_norm": 0.2789400887911893,
"learning_rate": 5.983907243589537e-06,
"loss": 0.16141357421875,
"memory(GiB)": 9.45,
"step": 265,
"train_speed(iter/s)": 0.456772
},
{
"epoch": 0.2017560246590697,
"grad_norm": 0.33400251489865246,
"learning_rate": 5.983294810294439e-06,
"loss": 0.158544921875,
"memory(GiB)": 9.45,
"step": 270,
"train_speed(iter/s)": 0.457152
},
{
"epoch": 0.20549224733794133,
"grad_norm": 0.4225006545766808,
"learning_rate": 5.982670972607426e-06,
"loss": 0.1498046875,
"memory(GiB)": 9.45,
"step": 275,
"train_speed(iter/s)": 0.457743
},
{
"epoch": 0.209228470016813,
"grad_norm": 0.43474965051646863,
"learning_rate": 5.982035732913273e-06,
"loss": 0.1770263671875,
"memory(GiB)": 9.45,
"step": 280,
"train_speed(iter/s)": 0.457807
},
{
"epoch": 0.21296469269568466,
"grad_norm": 0.36173927443406817,
"learning_rate": 5.981389093640344e-06,
"loss": 0.1758056640625,
"memory(GiB)": 9.45,
"step": 285,
"train_speed(iter/s)": 0.458088
},
{
"epoch": 0.21670091537455632,
"grad_norm": 0.25308312315237813,
"learning_rate": 5.980731057260579e-06,
"loss": 0.173388671875,
"memory(GiB)": 9.45,
"step": 290,
"train_speed(iter/s)": 0.457498
},
{
"epoch": 0.220437138053428,
"grad_norm": 0.29470555914634394,
"learning_rate": 5.980061626289489e-06,
"loss": 0.15411376953125,
"memory(GiB)": 9.45,
"step": 295,
"train_speed(iter/s)": 0.457387
},
{
"epoch": 0.22417336073229965,
"grad_norm": 0.35624287307171026,
"learning_rate": 5.9793808032861385e-06,
"loss": 0.1614501953125,
"memory(GiB)": 9.45,
"step": 300,
"train_speed(iter/s)": 0.457895
},
{
"epoch": 0.22790958341117132,
"grad_norm": 0.2504855752959934,
"learning_rate": 5.9786885908531455e-06,
"loss": 0.15517578125,
"memory(GiB)": 9.45,
"step": 305,
"train_speed(iter/s)": 0.458265
},
{
"epoch": 0.23164580609004295,
"grad_norm": 0.33904923734016645,
"learning_rate": 5.977984991636665e-06,
"loss": 0.1745361328125,
"memory(GiB)": 9.45,
"step": 310,
"train_speed(iter/s)": 0.458658
},
{
"epoch": 0.23538202876891462,
"grad_norm": 0.3551555191841338,
"learning_rate": 5.977270008326383e-06,
"loss": 0.157275390625,
"memory(GiB)": 9.45,
"step": 315,
"train_speed(iter/s)": 0.459103
},
{
"epoch": 0.23911825144778628,
"grad_norm": 0.4587798002581139,
"learning_rate": 5.9765436436555e-06,
"loss": 0.1659423828125,
"memory(GiB)": 9.45,
"step": 320,
"train_speed(iter/s)": 0.459434
},
{
"epoch": 0.24285447412665795,
"grad_norm": 0.3505254508815674,
"learning_rate": 5.975805900400728e-06,
"loss": 0.1699951171875,
"memory(GiB)": 9.45,
"step": 325,
"train_speed(iter/s)": 0.459396
},
{
"epoch": 0.2465906968055296,
"grad_norm": 0.3234531871867349,
"learning_rate": 5.9750567813822766e-06,
"loss": 0.15689697265625,
"memory(GiB)": 9.45,
"step": 330,
"train_speed(iter/s)": 0.459815
},
{
"epoch": 0.2503269194844013,
"grad_norm": 0.2847235822528394,
"learning_rate": 5.974296289463838e-06,
"loss": 0.1782470703125,
"memory(GiB)": 9.45,
"step": 335,
"train_speed(iter/s)": 0.460005
},
{
"epoch": 0.2540631421632729,
"grad_norm": 0.19887321720781595,
"learning_rate": 5.973524427552586e-06,
"loss": 0.1454345703125,
"memory(GiB)": 9.45,
"step": 340,
"train_speed(iter/s)": 0.46045
},
{
"epoch": 0.2577993648421446,
"grad_norm": 0.35609582881164253,
"learning_rate": 5.972741198599155e-06,
"loss": 0.15576171875,
"memory(GiB)": 9.45,
"step": 345,
"train_speed(iter/s)": 0.460808
},
{
"epoch": 0.26153558752101624,
"grad_norm": 0.3260335257305967,
"learning_rate": 5.971946605597634e-06,
"loss": 0.1542236328125,
"memory(GiB)": 9.45,
"step": 350,
"train_speed(iter/s)": 0.461081
},
{
"epoch": 0.26527181019988794,
"grad_norm": 0.3000956082136632,
"learning_rate": 5.9711406515855535e-06,
"loss": 0.1672119140625,
"memory(GiB)": 9.45,
"step": 355,
"train_speed(iter/s)": 0.461632
},
{
"epoch": 0.2690080328787596,
"grad_norm": 0.5003356531721083,
"learning_rate": 5.970323339643875e-06,
"loss": 0.141943359375,
"memory(GiB)": 9.45,
"step": 360,
"train_speed(iter/s)": 0.46182
},
{
"epoch": 0.2727442555576312,
"grad_norm": 0.3898278569959764,
"learning_rate": 5.969494672896979e-06,
"loss": 0.1525146484375,
"memory(GiB)": 9.45,
"step": 365,
"train_speed(iter/s)": 0.461906
},
{
"epoch": 0.2764804782365029,
"grad_norm": 0.3453310818742678,
"learning_rate": 5.96865465451265e-06,
"loss": 0.178564453125,
"memory(GiB)": 9.45,
"step": 370,
"train_speed(iter/s)": 0.46223
},
{
"epoch": 0.28021670091537454,
"grad_norm": 0.38009861005791173,
"learning_rate": 5.9678032877020705e-06,
"loss": 0.1583251953125,
"memory(GiB)": 9.45,
"step": 375,
"train_speed(iter/s)": 0.46236
},
{
"epoch": 0.28395292359424623,
"grad_norm": 0.3337227144486021,
"learning_rate": 5.966940575719802e-06,
"loss": 0.164697265625,
"memory(GiB)": 9.45,
"step": 380,
"train_speed(iter/s)": 0.462583
},
{
"epoch": 0.28768914627311787,
"grad_norm": 0.34344615999699735,
"learning_rate": 5.966066521863778e-06,
"loss": 0.155126953125,
"memory(GiB)": 9.45,
"step": 385,
"train_speed(iter/s)": 0.462936
},
{
"epoch": 0.29142536895198956,
"grad_norm": 0.3782402092083932,
"learning_rate": 5.9651811294752885e-06,
"loss": 0.161767578125,
"memory(GiB)": 9.45,
"step": 390,
"train_speed(iter/s)": 0.463287
},
{
"epoch": 0.2951615916308612,
"grad_norm": 0.3820929493431576,
"learning_rate": 5.964284401938968e-06,
"loss": 0.1547119140625,
"memory(GiB)": 9.45,
"step": 395,
"train_speed(iter/s)": 0.463312
},
{
"epoch": 0.29889781430973283,
"grad_norm": 0.37254277787709306,
"learning_rate": 5.96337634268278e-06,
"loss": 0.1453125,
"memory(GiB)": 9.45,
"step": 400,
"train_speed(iter/s)": 0.463552
},
{
"epoch": 0.3026340369886045,
"grad_norm": 0.3771270351369902,
"learning_rate": 5.9624569551780115e-06,
"loss": 0.1693603515625,
"memory(GiB)": 9.45,
"step": 405,
"train_speed(iter/s)": 0.463665
},
{
"epoch": 0.30637025966747616,
"grad_norm": 0.3169810724128572,
"learning_rate": 5.961526242939251e-06,
"loss": 0.143310546875,
"memory(GiB)": 9.45,
"step": 410,
"train_speed(iter/s)": 0.463774
},
{
"epoch": 0.31010648234634786,
"grad_norm": 0.39276892682897285,
"learning_rate": 5.960584209524377e-06,
"loss": 0.12626953125,
"memory(GiB)": 9.45,
"step": 415,
"train_speed(iter/s)": 0.463772
},
{
"epoch": 0.3138427050252195,
"grad_norm": 0.30248041554648486,
"learning_rate": 5.95963085853455e-06,
"loss": 0.1291259765625,
"memory(GiB)": 9.45,
"step": 420,
"train_speed(iter/s)": 0.464062
},
{
"epoch": 0.3175789277040912,
"grad_norm": 0.31139734130517427,
"learning_rate": 5.958666193614194e-06,
"loss": 0.1403564453125,
"memory(GiB)": 9.45,
"step": 425,
"train_speed(iter/s)": 0.46431
},
{
"epoch": 0.3213151503829628,
"grad_norm": 0.29672071282145907,
"learning_rate": 5.95769021845098e-06,
"loss": 0.1619140625,
"memory(GiB)": 9.45,
"step": 430,
"train_speed(iter/s)": 0.464574
},
{
"epoch": 0.32505137306183446,
"grad_norm": 0.3245553447126267,
"learning_rate": 5.956702936775819e-06,
"loss": 0.149169921875,
"memory(GiB)": 9.45,
"step": 435,
"train_speed(iter/s)": 0.464656
},
{
"epoch": 0.32878759574070615,
"grad_norm": 0.37942479273965346,
"learning_rate": 5.955704352362843e-06,
"loss": 0.1540283203125,
"memory(GiB)": 9.45,
"step": 440,
"train_speed(iter/s)": 0.464866
},
{
"epoch": 0.3325238184195778,
"grad_norm": 0.4722961848658832,
"learning_rate": 5.954694469029391e-06,
"loss": 0.146875,
"memory(GiB)": 9.45,
"step": 445,
"train_speed(iter/s)": 0.46511
},
{
"epoch": 0.3362600410984495,
"grad_norm": 0.32208483256209325,
"learning_rate": 5.9536732906359936e-06,
"loss": 0.1362060546875,
"memory(GiB)": 9.45,
"step": 450,
"train_speed(iter/s)": 0.465444
},
{
"epoch": 0.3399962637773211,
"grad_norm": 0.39468565724302457,
"learning_rate": 5.952640821086362e-06,
"loss": 0.14046630859375,
"memory(GiB)": 9.45,
"step": 455,
"train_speed(iter/s)": 0.465502
},
{
"epoch": 0.3437324864561928,
"grad_norm": 0.2923449968980904,
"learning_rate": 5.951597064327371e-06,
"loss": 0.14259033203125,
"memory(GiB)": 9.45,
"step": 460,
"train_speed(iter/s)": 0.465768
},
{
"epoch": 0.34746870913506445,
"grad_norm": 0.2526312937320368,
"learning_rate": 5.95054202434904e-06,
"loss": 0.154150390625,
"memory(GiB)": 9.45,
"step": 465,
"train_speed(iter/s)": 0.465477
},
{
"epoch": 0.3512049318139361,
"grad_norm": 0.25397429668673016,
"learning_rate": 5.949475705184526e-06,
"loss": 0.145068359375,
"memory(GiB)": 9.45,
"step": 470,
"train_speed(iter/s)": 0.465793
},
{
"epoch": 0.3549411544928078,
"grad_norm": 0.2889099964297901,
"learning_rate": 5.948398110910099e-06,
"loss": 0.14326171875,
"memory(GiB)": 9.45,
"step": 475,
"train_speed(iter/s)": 0.465718
},
{
"epoch": 0.3586773771716794,
"grad_norm": 0.30650880945183995,
"learning_rate": 5.947309245645134e-06,
"loss": 0.17294921875,
"memory(GiB)": 9.45,
"step": 480,
"train_speed(iter/s)": 0.465738
},
{
"epoch": 0.3624135998505511,
"grad_norm": 0.23874814446464385,
"learning_rate": 5.946209113552092e-06,
"loss": 0.1577880859375,
"memory(GiB)": 9.45,
"step": 485,
"train_speed(iter/s)": 0.465905
},
{
"epoch": 0.36614982252942274,
"grad_norm": 0.26737529230375395,
"learning_rate": 5.945097718836503e-06,
"loss": 0.13236083984375,
"memory(GiB)": 9.45,
"step": 490,
"train_speed(iter/s)": 0.466159
},
{
"epoch": 0.36988604520829443,
"grad_norm": 0.34648783089300494,
"learning_rate": 5.9439750657469524e-06,
"loss": 0.166064453125,
"memory(GiB)": 9.45,
"step": 495,
"train_speed(iter/s)": 0.466248
},
{
"epoch": 0.37362226788716607,
"grad_norm": 0.3711374814276351,
"learning_rate": 5.942841158575061e-06,
"loss": 0.15181884765625,
"memory(GiB)": 9.45,
"step": 500,
"train_speed(iter/s)": 0.46631
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.26122017355859195,
"learning_rate": 5.941696001655475e-06,
"loss": 0.1420654296875,
"memory(GiB)": 9.45,
"step": 505,
"train_speed(iter/s)": 0.466356
},
{
"epoch": 0.3810947132449094,
"grad_norm": 0.30129945797313573,
"learning_rate": 5.940539599365843e-06,
"loss": 0.15704345703125,
"memory(GiB)": 9.45,
"step": 510,
"train_speed(iter/s)": 0.466088
},
{
"epoch": 0.38483093592378104,
"grad_norm": 0.27115019497623694,
"learning_rate": 5.939371956126803e-06,
"loss": 0.1350341796875,
"memory(GiB)": 9.45,
"step": 515,
"train_speed(iter/s)": 0.466144
},
{
"epoch": 0.38856715860265273,
"grad_norm": 0.3323988811121097,
"learning_rate": 5.938193076401964e-06,
"loss": 0.149072265625,
"memory(GiB)": 9.45,
"step": 520,
"train_speed(iter/s)": 0.466125
},
{
"epoch": 0.39230338128152437,
"grad_norm": 0.36151939711979136,
"learning_rate": 5.937002964697888e-06,
"loss": 0.13743896484375,
"memory(GiB)": 9.45,
"step": 525,
"train_speed(iter/s)": 0.466282
},
{
"epoch": 0.39603960396039606,
"grad_norm": 0.28979409508186516,
"learning_rate": 5.935801625564074e-06,
"loss": 0.15244140625,
"memory(GiB)": 9.45,
"step": 530,
"train_speed(iter/s)": 0.466375
},
{
"epoch": 0.3997758266392677,
"grad_norm": 0.3296511480431298,
"learning_rate": 5.934589063592946e-06,
"loss": 0.15579833984375,
"memory(GiB)": 9.45,
"step": 535,
"train_speed(iter/s)": 0.466122
},
{
"epoch": 0.4035120493181394,
"grad_norm": 0.20344435673525696,
"learning_rate": 5.933365283419823e-06,
"loss": 0.151953125,
"memory(GiB)": 9.45,
"step": 540,
"train_speed(iter/s)": 0.466272
},
{
"epoch": 0.407248271997011,
"grad_norm": 0.2633323321431179,
"learning_rate": 5.932130289722912e-06,
"loss": 0.15283203125,
"memory(GiB)": 9.45,
"step": 545,
"train_speed(iter/s)": 0.466106
},
{
"epoch": 0.41098449467588266,
"grad_norm": 0.3221514229815824,
"learning_rate": 5.9308840872232845e-06,
"loss": 0.16361083984375,
"memory(GiB)": 9.45,
"step": 550,
"train_speed(iter/s)": 0.466076
},
{
"epoch": 0.41472071735475435,
"grad_norm": 0.2957653300069589,
"learning_rate": 5.929626680684864e-06,
"loss": 0.1420654296875,
"memory(GiB)": 9.45,
"step": 555,
"train_speed(iter/s)": 0.466246
},
{
"epoch": 0.418456940033626,
"grad_norm": 0.27433097372254944,
"learning_rate": 5.928358074914402e-06,
"loss": 0.133544921875,
"memory(GiB)": 9.45,
"step": 560,
"train_speed(iter/s)": 0.466388
},
{
"epoch": 0.4221931627124977,
"grad_norm": 0.30811252206856754,
"learning_rate": 5.927078274761459e-06,
"loss": 0.13226318359375,
"memory(GiB)": 9.45,
"step": 565,
"train_speed(iter/s)": 0.46622
},
{
"epoch": 0.4259293853913693,
"grad_norm": 0.4343588364932629,
"learning_rate": 5.925787285118395e-06,
"loss": 0.132061767578125,
"memory(GiB)": 9.45,
"step": 570,
"train_speed(iter/s)": 0.466434
},
{
"epoch": 0.429665608070241,
"grad_norm": 0.28429865872011917,
"learning_rate": 5.9244851109203404e-06,
"loss": 0.1482177734375,
"memory(GiB)": 9.45,
"step": 575,
"train_speed(iter/s)": 0.466569
},
{
"epoch": 0.43340183074911265,
"grad_norm": 0.27884340279867387,
"learning_rate": 5.923171757145182e-06,
"loss": 0.14344482421875,
"memory(GiB)": 9.45,
"step": 580,
"train_speed(iter/s)": 0.46672
},
{
"epoch": 0.4371380534279843,
"grad_norm": 0.32622048115497765,
"learning_rate": 5.921847228813543e-06,
"loss": 0.146728515625,
"memory(GiB)": 9.45,
"step": 585,
"train_speed(iter/s)": 0.466879
},
{
"epoch": 0.440874276106856,
"grad_norm": 0.43240131198466947,
"learning_rate": 5.9205115309887666e-06,
"loss": 0.1595458984375,
"memory(GiB)": 9.45,
"step": 590,
"train_speed(iter/s)": 0.466944
},
{
"epoch": 0.4446104987857276,
"grad_norm": 0.31277968889979835,
"learning_rate": 5.919164668776891e-06,
"loss": 0.1449462890625,
"memory(GiB)": 9.45,
"step": 595,
"train_speed(iter/s)": 0.467155
},
{
"epoch": 0.4483467214645993,
"grad_norm": 0.2719906915702348,
"learning_rate": 5.917806647326636e-06,
"loss": 0.1359130859375,
"memory(GiB)": 9.45,
"step": 600,
"train_speed(iter/s)": 0.467399
},
{
"epoch": 0.45208294414347094,
"grad_norm": 0.2958357180656749,
"learning_rate": 5.9164374718293764e-06,
"loss": 0.1510498046875,
"memory(GiB)": 9.45,
"step": 605,
"train_speed(iter/s)": 0.467309
},
{
"epoch": 0.45581916682234264,
"grad_norm": 0.323840801916129,
"learning_rate": 5.91505714751913e-06,
"loss": 0.1556884765625,
"memory(GiB)": 9.45,
"step": 610,
"train_speed(iter/s)": 0.467461
},
{
"epoch": 0.4595553895012143,
"grad_norm": 0.25540153277044697,
"learning_rate": 5.913665679672533e-06,
"loss": 0.1478271484375,
"memory(GiB)": 9.45,
"step": 615,
"train_speed(iter/s)": 0.467614
},
{
"epoch": 0.4632916121800859,
"grad_norm": 0.3216539204738006,
"learning_rate": 5.912263073608819e-06,
"loss": 0.14404296875,
"memory(GiB)": 9.45,
"step": 620,
"train_speed(iter/s)": 0.46759
},
{
"epoch": 0.4670278348589576,
"grad_norm": 0.3564645954321089,
"learning_rate": 5.9108493346898014e-06,
"loss": 0.1556640625,
"memory(GiB)": 9.45,
"step": 625,
"train_speed(iter/s)": 0.467777
},
{
"epoch": 0.47076405753782924,
"grad_norm": 0.3234498866245867,
"learning_rate": 5.9094244683198514e-06,
"loss": 0.130474853515625,
"memory(GiB)": 9.45,
"step": 630,
"train_speed(iter/s)": 0.467657
},
{
"epoch": 0.47450028021670093,
"grad_norm": 0.27930245162799133,
"learning_rate": 5.907988479945878e-06,
"loss": 0.1467529296875,
"memory(GiB)": 9.45,
"step": 635,
"train_speed(iter/s)": 0.467501
},
{
"epoch": 0.47823650289557257,
"grad_norm": 0.2831117651967566,
"learning_rate": 5.906541375057305e-06,
"loss": 0.135107421875,
"memory(GiB)": 9.45,
"step": 640,
"train_speed(iter/s)": 0.467611
},
{
"epoch": 0.48197272557444426,
"grad_norm": 0.27115323391313917,
"learning_rate": 5.905083159186056e-06,
"loss": 0.128759765625,
"memory(GiB)": 9.45,
"step": 645,
"train_speed(iter/s)": 0.46749
},
{
"epoch": 0.4857089482533159,
"grad_norm": 0.32564068860731793,
"learning_rate": 5.903613837906525e-06,
"loss": 0.1319580078125,
"memory(GiB)": 9.45,
"step": 650,
"train_speed(iter/s)": 0.467686
},
{
"epoch": 0.48944517093218753,
"grad_norm": 0.2387399044673888,
"learning_rate": 5.902133416835561e-06,
"loss": 0.1338134765625,
"memory(GiB)": 9.45,
"step": 655,
"train_speed(iter/s)": 0.467441
},
{
"epoch": 0.4931813936110592,
"grad_norm": 0.24117814539801136,
"learning_rate": 5.900641901632444e-06,
"loss": 0.1324462890625,
"memory(GiB)": 9.45,
"step": 660,
"train_speed(iter/s)": 0.46725
},
{
"epoch": 0.49691761628993086,
"grad_norm": 0.2806056999325975,
"learning_rate": 5.899139297998865e-06,
"loss": 0.14583740234375,
"memory(GiB)": 9.45,
"step": 665,
"train_speed(iter/s)": 0.467303
},
{
"epoch": 0.5006538389688026,
"grad_norm": 0.3602595784462823,
"learning_rate": 5.897625611678904e-06,
"loss": 0.16168212890625,
"memory(GiB)": 9.45,
"step": 670,
"train_speed(iter/s)": 0.467465
},
{
"epoch": 0.5043900616476742,
"grad_norm": 0.2892704455949438,
"learning_rate": 5.896100848459004e-06,
"loss": 0.14654541015625,
"memory(GiB)": 9.45,
"step": 675,
"train_speed(iter/s)": 0.467417
},
{
"epoch": 0.5081262843265458,
"grad_norm": 0.30864116070274367,
"learning_rate": 5.894565014167955e-06,
"loss": 0.1387451171875,
"memory(GiB)": 9.45,
"step": 680,
"train_speed(iter/s)": 0.467388
},
{
"epoch": 0.5118625070054176,
"grad_norm": 0.23741861823114724,
"learning_rate": 5.89301811467687e-06,
"loss": 0.14443359375,
"memory(GiB)": 9.45,
"step": 685,
"train_speed(iter/s)": 0.467619
},
{
"epoch": 0.5155987296842892,
"grad_norm": 0.3704119854549676,
"learning_rate": 5.891460155899159e-06,
"loss": 0.1429931640625,
"memory(GiB)": 9.45,
"step": 690,
"train_speed(iter/s)": 0.467553
},
{
"epoch": 0.5193349523631609,
"grad_norm": 0.3371956586173727,
"learning_rate": 5.88989114379051e-06,
"loss": 0.122119140625,
"memory(GiB)": 9.45,
"step": 695,
"train_speed(iter/s)": 0.467568
},
{
"epoch": 0.5230711750420325,
"grad_norm": 0.23061580193263015,
"learning_rate": 5.888311084348865e-06,
"loss": 0.1429931640625,
"memory(GiB)": 9.45,
"step": 700,
"train_speed(iter/s)": 0.467617
},
{
"epoch": 0.5268073977209041,
"grad_norm": 0.2357495758457104,
"learning_rate": 5.886719983614396e-06,
"loss": 0.1326904296875,
"memory(GiB)": 9.45,
"step": 705,
"train_speed(iter/s)": 0.467672
},
{
"epoch": 0.5305436203997759,
"grad_norm": 0.20506003694806352,
"learning_rate": 5.885117847669485e-06,
"loss": 0.1441650390625,
"memory(GiB)": 9.45,
"step": 710,
"train_speed(iter/s)": 0.467709
},
{
"epoch": 0.5342798430786475,
"grad_norm": 0.3366909550119504,
"learning_rate": 5.883504682638699e-06,
"loss": 0.1407958984375,
"memory(GiB)": 9.45,
"step": 715,
"train_speed(iter/s)": 0.467852
},
{
"epoch": 0.5380160657575191,
"grad_norm": 0.2909847266005631,
"learning_rate": 5.881880494688763e-06,
"loss": 0.1455322265625,
"memory(GiB)": 9.45,
"step": 720,
"train_speed(iter/s)": 0.467893
},
{
"epoch": 0.5417522884363908,
"grad_norm": 0.2386052769018931,
"learning_rate": 5.880245290028545e-06,
"loss": 0.140478515625,
"memory(GiB)": 9.45,
"step": 725,
"train_speed(iter/s)": 0.467751
},
{
"epoch": 0.5454885111152624,
"grad_norm": 0.2645707847366404,
"learning_rate": 5.878599074909023e-06,
"loss": 0.1463134765625,
"memory(GiB)": 9.45,
"step": 730,
"train_speed(iter/s)": 0.467878
},
{
"epoch": 0.5492247337941342,
"grad_norm": 0.31563029908522805,
"learning_rate": 5.876941855623268e-06,
"loss": 0.1530029296875,
"memory(GiB)": 9.45,
"step": 735,
"train_speed(iter/s)": 0.467974
},
{
"epoch": 0.5529609564730058,
"grad_norm": 0.26319413448836815,
"learning_rate": 5.8752736385064145e-06,
"loss": 0.12587890625,
"memory(GiB)": 9.45,
"step": 740,
"train_speed(iter/s)": 0.467961
},
{
"epoch": 0.5566971791518774,
"grad_norm": 0.3858440978882179,
"learning_rate": 5.873594429935642e-06,
"loss": 0.1377197265625,
"memory(GiB)": 9.45,
"step": 745,
"train_speed(iter/s)": 0.468054
},
{
"epoch": 0.5604334018307491,
"grad_norm": 0.20276433188895907,
"learning_rate": 5.871904236330144e-06,
"loss": 0.12718505859375,
"memory(GiB)": 9.45,
"step": 750,
"train_speed(iter/s)": 0.468081
},
{
"epoch": 0.5641696245096208,
"grad_norm": 0.22243564217533868,
"learning_rate": 5.870203064151111e-06,
"loss": 0.1421630859375,
"memory(GiB)": 9.45,
"step": 755,
"train_speed(iter/s)": 0.468228
},
{
"epoch": 0.5679058471884925,
"grad_norm": 0.2924186985340597,
"learning_rate": 5.8684909199017e-06,
"loss": 0.145458984375,
"memory(GiB)": 9.45,
"step": 760,
"train_speed(iter/s)": 0.468279
},
{
"epoch": 0.5716420698673641,
"grad_norm": 0.22056169438669584,
"learning_rate": 5.866767810127009e-06,
"loss": 0.128564453125,
"memory(GiB)": 9.45,
"step": 765,
"train_speed(iter/s)": 0.468225
},
{
"epoch": 0.5753782925462357,
"grad_norm": 0.2740803532217515,
"learning_rate": 5.86503374141406e-06,
"loss": 0.1392822265625,
"memory(GiB)": 9.45,
"step": 770,
"train_speed(iter/s)": 0.468416
},
{
"epoch": 0.5791145152251074,
"grad_norm": 0.3606780255005757,
"learning_rate": 5.863288720391763e-06,
"loss": 0.155615234375,
"memory(GiB)": 9.45,
"step": 775,
"train_speed(iter/s)": 0.468411
},
{
"epoch": 0.5828507379039791,
"grad_norm": 0.21221894282841508,
"learning_rate": 5.861532753730898e-06,
"loss": 0.1374755859375,
"memory(GiB)": 9.45,
"step": 780,
"train_speed(iter/s)": 0.468088
},
{
"epoch": 0.5865869605828508,
"grad_norm": 0.2660755841560947,
"learning_rate": 5.859765848144089e-06,
"loss": 0.13995361328125,
"memory(GiB)": 9.45,
"step": 785,
"train_speed(iter/s)": 0.467999
},
{
"epoch": 0.5903231832617224,
"grad_norm": 0.22600558934152162,
"learning_rate": 5.857988010385774e-06,
"loss": 0.128515625,
"memory(GiB)": 9.45,
"step": 790,
"train_speed(iter/s)": 0.468097
},
{
"epoch": 0.594059405940594,
"grad_norm": 0.24882069836354315,
"learning_rate": 5.856199247252184e-06,
"loss": 0.1505126953125,
"memory(GiB)": 9.45,
"step": 795,
"train_speed(iter/s)": 0.468257
},
{
"epoch": 0.5977956286194657,
"grad_norm": 0.2541011112429318,
"learning_rate": 5.854399565581314e-06,
"loss": 0.13427734375,
"memory(GiB)": 10.57,
"step": 800,
"train_speed(iter/s)": 0.468265
},
{
"epoch": 0.6015318512983374,
"grad_norm": 0.2872004300607469,
"learning_rate": 5.8525889722528985e-06,
"loss": 0.1360595703125,
"memory(GiB)": 10.57,
"step": 805,
"train_speed(iter/s)": 0.468185
},
{
"epoch": 0.605268073977209,
"grad_norm": 0.323332001257219,
"learning_rate": 5.850767474188383e-06,
"loss": 0.1507080078125,
"memory(GiB)": 10.57,
"step": 810,
"train_speed(iter/s)": 0.468123
},
{
"epoch": 0.6090042966560807,
"grad_norm": 0.20912945195997415,
"learning_rate": 5.8489350783509025e-06,
"loss": 0.13023681640625,
"memory(GiB)": 10.57,
"step": 815,
"train_speed(iter/s)": 0.46818
},
{
"epoch": 0.6127405193349523,
"grad_norm": 0.2939854808777276,
"learning_rate": 5.847091791745247e-06,
"loss": 0.14840087890625,
"memory(GiB)": 10.57,
"step": 820,
"train_speed(iter/s)": 0.468357
},
{
"epoch": 0.6164767420138241,
"grad_norm": 0.24988955332399215,
"learning_rate": 5.8452376214178426e-06,
"loss": 0.12974853515625,
"memory(GiB)": 10.57,
"step": 825,
"train_speed(iter/s)": 0.468499
},
{
"epoch": 0.6202129646926957,
"grad_norm": 0.21781664947497836,
"learning_rate": 5.84337257445672e-06,
"loss": 0.1396484375,
"memory(GiB)": 10.57,
"step": 830,
"train_speed(iter/s)": 0.468441
},
{
"epoch": 0.6239491873715673,
"grad_norm": 0.25082480295038034,
"learning_rate": 5.841496657991487e-06,
"loss": 0.135546875,
"memory(GiB)": 10.57,
"step": 835,
"train_speed(iter/s)": 0.468446
},
{
"epoch": 0.627685410050439,
"grad_norm": 0.2686863733874229,
"learning_rate": 5.8396098791933055e-06,
"loss": 0.11251220703125,
"memory(GiB)": 10.57,
"step": 840,
"train_speed(iter/s)": 0.46852
},
{
"epoch": 0.6314216327293106,
"grad_norm": 0.2710369755059897,
"learning_rate": 5.837712245274861e-06,
"loss": 0.11365966796875,
"memory(GiB)": 10.57,
"step": 845,
"train_speed(iter/s)": 0.468573
},
{
"epoch": 0.6351578554081824,
"grad_norm": 0.34487595194525544,
"learning_rate": 5.835803763490333e-06,
"loss": 0.1312255859375,
"memory(GiB)": 10.57,
"step": 850,
"train_speed(iter/s)": 0.468679
},
{
"epoch": 0.638894078087054,
"grad_norm": 0.2557913641225529,
"learning_rate": 5.833884441135373e-06,
"loss": 0.150701904296875,
"memory(GiB)": 10.57,
"step": 855,
"train_speed(iter/s)": 0.468713
},
{
"epoch": 0.6426303007659256,
"grad_norm": 0.2492246452188681,
"learning_rate": 5.831954285547071e-06,
"loss": 0.1027587890625,
"memory(GiB)": 10.57,
"step": 860,
"train_speed(iter/s)": 0.468697
},
{
"epoch": 0.6463665234447973,
"grad_norm": 0.20962556058124304,
"learning_rate": 5.830013304103929e-06,
"loss": 0.13544921875,
"memory(GiB)": 10.57,
"step": 865,
"train_speed(iter/s)": 0.468784
},
{
"epoch": 0.6501027461236689,
"grad_norm": 0.26313981696050626,
"learning_rate": 5.828061504225837e-06,
"loss": 0.13037109375,
"memory(GiB)": 10.57,
"step": 870,
"train_speed(iter/s)": 0.468837
},
{
"epoch": 0.6538389688025407,
"grad_norm": 0.3459843916573515,
"learning_rate": 5.826098893374037e-06,
"loss": 0.1420654296875,
"memory(GiB)": 10.57,
"step": 875,
"train_speed(iter/s)": 0.468901
},
{
"epoch": 0.6575751914814123,
"grad_norm": 0.27792291827470583,
"learning_rate": 5.824125479051103e-06,
"loss": 0.13037109375,
"memory(GiB)": 10.57,
"step": 880,
"train_speed(iter/s)": 0.468895
},
{
"epoch": 0.6613114141602839,
"grad_norm": 0.2607823555214958,
"learning_rate": 5.8221412688009034e-06,
"loss": 0.135107421875,
"memory(GiB)": 10.57,
"step": 885,
"train_speed(iter/s)": 0.468908
},
{
"epoch": 0.6650476368391556,
"grad_norm": 0.2501794810871831,
"learning_rate": 5.820146270208581e-06,
"loss": 0.12391357421875,
"memory(GiB)": 10.57,
"step": 890,
"train_speed(iter/s)": 0.468941
},
{
"epoch": 0.6687838595180273,
"grad_norm": 0.2564710519025842,
"learning_rate": 5.8181404909005175e-06,
"loss": 0.14501953125,
"memory(GiB)": 10.57,
"step": 895,
"train_speed(iter/s)": 0.46911
},
{
"epoch": 0.672520082196899,
"grad_norm": 0.2715014489679807,
"learning_rate": 5.816123938544305e-06,
"loss": 0.132275390625,
"memory(GiB)": 10.57,
"step": 900,
"train_speed(iter/s)": 0.469243
},
{
"epoch": 0.6762563048757706,
"grad_norm": 0.321526690011715,
"learning_rate": 5.814096620848723e-06,
"loss": 0.14796142578125,
"memory(GiB)": 10.57,
"step": 905,
"train_speed(iter/s)": 0.469369
},
{
"epoch": 0.6799925275546422,
"grad_norm": 0.26737616140516984,
"learning_rate": 5.8120585455636975e-06,
"loss": 0.1335205078125,
"memory(GiB)": 10.57,
"step": 910,
"train_speed(iter/s)": 0.469536
},
{
"epoch": 0.6837287502335139,
"grad_norm": 0.23441075691954993,
"learning_rate": 5.8100097204802854e-06,
"loss": 0.13460693359375,
"memory(GiB)": 10.57,
"step": 915,
"train_speed(iter/s)": 0.469558
},
{
"epoch": 0.6874649729123856,
"grad_norm": 0.3004174037886124,
"learning_rate": 5.807950153430634e-06,
"loss": 0.13314208984375,
"memory(GiB)": 10.57,
"step": 920,
"train_speed(iter/s)": 0.469494
},
{
"epoch": 0.6912011955912573,
"grad_norm": 0.3511989299300596,
"learning_rate": 5.805879852287953e-06,
"loss": 0.11871337890625,
"memory(GiB)": 10.57,
"step": 925,
"train_speed(iter/s)": 0.469497
},
{
"epoch": 0.6949374182701289,
"grad_norm": 0.22941841038115351,
"learning_rate": 5.803798824966487e-06,
"loss": 0.12340087890625,
"memory(GiB)": 10.57,
"step": 930,
"train_speed(iter/s)": 0.469442
},
{
"epoch": 0.6986736409490005,
"grad_norm": 0.2296737881416939,
"learning_rate": 5.801707079421485e-06,
"loss": 0.115618896484375,
"memory(GiB)": 10.57,
"step": 935,
"train_speed(iter/s)": 0.469455
},
{
"epoch": 0.7024098636278722,
"grad_norm": 0.2832572168234479,
"learning_rate": 5.799604623649168e-06,
"loss": 0.1305908203125,
"memory(GiB)": 10.57,
"step": 940,
"train_speed(iter/s)": 0.46963
},
{
"epoch": 0.7061460863067439,
"grad_norm": 0.316216648821189,
"learning_rate": 5.7974914656867004e-06,
"loss": 0.123193359375,
"memory(GiB)": 10.57,
"step": 945,
"train_speed(iter/s)": 0.46966
},
{
"epoch": 0.7098823089856156,
"grad_norm": 0.26883224736976363,
"learning_rate": 5.795367613612158e-06,
"loss": 0.12900390625,
"memory(GiB)": 10.57,
"step": 950,
"train_speed(iter/s)": 0.469682
},
{
"epoch": 0.7136185316644872,
"grad_norm": 0.2965748829854584,
"learning_rate": 5.793233075544498e-06,
"loss": 0.11947021484375,
"memory(GiB)": 10.57,
"step": 955,
"train_speed(iter/s)": 0.469772
},
{
"epoch": 0.7173547543433588,
"grad_norm": 0.23063703167824398,
"learning_rate": 5.791087859643525e-06,
"loss": 0.15511474609375,
"memory(GiB)": 10.57,
"step": 960,
"train_speed(iter/s)": 0.469872
},
{
"epoch": 0.7210909770222306,
"grad_norm": 0.3034417815586922,
"learning_rate": 5.788931974109867e-06,
"loss": 0.1328369140625,
"memory(GiB)": 10.57,
"step": 965,
"train_speed(iter/s)": 0.469955
},
{
"epoch": 0.7248271997011022,
"grad_norm": 0.2606666501840904,
"learning_rate": 5.7867654271849355e-06,
"loss": 0.1348388671875,
"memory(GiB)": 10.57,
"step": 970,
"train_speed(iter/s)": 0.470006
},
{
"epoch": 0.7285634223799738,
"grad_norm": 0.26581107097992346,
"learning_rate": 5.7845882271508975e-06,
"loss": 0.133349609375,
"memory(GiB)": 10.57,
"step": 975,
"train_speed(iter/s)": 0.470064
},
{
"epoch": 0.7322996450588455,
"grad_norm": 0.3223256775646686,
"learning_rate": 5.7824003823306484e-06,
"loss": 0.13079833984375,
"memory(GiB)": 10.57,
"step": 980,
"train_speed(iter/s)": 0.469962
},
{
"epoch": 0.7360358677377171,
"grad_norm": 0.3011414890652826,
"learning_rate": 5.780201901087771e-06,
"loss": 0.1345947265625,
"memory(GiB)": 10.57,
"step": 985,
"train_speed(iter/s)": 0.470102
},
{
"epoch": 0.7397720904165889,
"grad_norm": 0.30876194543952196,
"learning_rate": 5.777992791826512e-06,
"loss": 0.14576416015625,
"memory(GiB)": 10.57,
"step": 990,
"train_speed(iter/s)": 0.470252
},
{
"epoch": 0.7435083130954605,
"grad_norm": 0.2584420531503668,
"learning_rate": 5.775773062991744e-06,
"loss": 0.1373291015625,
"memory(GiB)": 10.57,
"step": 995,
"train_speed(iter/s)": 0.470366
},
{
"epoch": 0.7472445357743321,
"grad_norm": 0.30020605961844676,
"learning_rate": 5.773542723068937e-06,
"loss": 0.1499267578125,
"memory(GiB)": 10.57,
"step": 1000,
"train_speed(iter/s)": 0.470476
},
{
"epoch": 0.7509807584532038,
"grad_norm": 0.3190463063150355,
"learning_rate": 5.771301780584126e-06,
"loss": 0.13701171875,
"memory(GiB)": 10.57,
"step": 1005,
"train_speed(iter/s)": 0.470413
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.27457688934495655,
"learning_rate": 5.769050244103872e-06,
"loss": 0.14490966796875,
"memory(GiB)": 10.57,
"step": 1010,
"train_speed(iter/s)": 0.470476
},
{
"epoch": 0.7584532038109472,
"grad_norm": 0.2676875855097265,
"learning_rate": 5.76678812223524e-06,
"loss": 0.1295654296875,
"memory(GiB)": 10.57,
"step": 1015,
"train_speed(iter/s)": 0.470519
},
{
"epoch": 0.7621894264898188,
"grad_norm": 0.35808053452995126,
"learning_rate": 5.764515423625757e-06,
"loss": 0.148779296875,
"memory(GiB)": 10.57,
"step": 1020,
"train_speed(iter/s)": 0.470638
},
{
"epoch": 0.7659256491686904,
"grad_norm": 0.2842904674611216,
"learning_rate": 5.762232156963381e-06,
"loss": 0.142138671875,
"memory(GiB)": 10.57,
"step": 1025,
"train_speed(iter/s)": 0.47066
},
{
"epoch": 0.7696618718475621,
"grad_norm": 0.24187187753830167,
"learning_rate": 5.759938330976473e-06,
"loss": 0.12486572265625,
"memory(GiB)": 10.57,
"step": 1030,
"train_speed(iter/s)": 0.470767
},
{
"epoch": 0.7733980945264338,
"grad_norm": 0.17249284757124964,
"learning_rate": 5.757633954433757e-06,
"loss": 0.13060302734375,
"memory(GiB)": 10.57,
"step": 1035,
"train_speed(iter/s)": 0.470832
},
{
"epoch": 0.7771343172053055,
"grad_norm": 0.2299915320848999,
"learning_rate": 5.755319036144289e-06,
"loss": 0.1218017578125,
"memory(GiB)": 10.57,
"step": 1040,
"train_speed(iter/s)": 0.470946
},
{
"epoch": 0.7808705398841771,
"grad_norm": 0.19120763093823928,
"learning_rate": 5.752993584957426e-06,
"loss": 0.11143798828125,
"memory(GiB)": 10.57,
"step": 1045,
"train_speed(iter/s)": 0.470952
},
{
"epoch": 0.7846067625630487,
"grad_norm": 0.23296209254061714,
"learning_rate": 5.750657609762787e-06,
"loss": 0.12412109375,
"memory(GiB)": 10.57,
"step": 1050,
"train_speed(iter/s)": 0.471065
},
{
"epoch": 0.7883429852419204,
"grad_norm": 0.23478728691916106,
"learning_rate": 5.74831111949022e-06,
"loss": 0.119873046875,
"memory(GiB)": 10.57,
"step": 1055,
"train_speed(iter/s)": 0.471177
},
{
"epoch": 0.7920792079207921,
"grad_norm": 0.3162727585321945,
"learning_rate": 5.745954123109776e-06,
"loss": 0.1431884765625,
"memory(GiB)": 10.57,
"step": 1060,
"train_speed(iter/s)": 0.471178
},
{
"epoch": 0.7958154305996638,
"grad_norm": 0.2554843268036968,
"learning_rate": 5.743586629631663e-06,
"loss": 0.13331298828125,
"memory(GiB)": 10.57,
"step": 1065,
"train_speed(iter/s)": 0.471177
},
{
"epoch": 0.7995516532785354,
"grad_norm": 0.26771118158424334,
"learning_rate": 5.741208648106216e-06,
"loss": 0.12591552734375,
"memory(GiB)": 10.57,
"step": 1070,
"train_speed(iter/s)": 0.471319
},
{
"epoch": 0.803287875957407,
"grad_norm": 0.22210847866208316,
"learning_rate": 5.7388201876238665e-06,
"loss": 0.13048095703125,
"memory(GiB)": 10.57,
"step": 1075,
"train_speed(iter/s)": 0.471098
},
{
"epoch": 0.8070240986362788,
"grad_norm": 0.2090698541733704,
"learning_rate": 5.736421257315105e-06,
"loss": 0.128466796875,
"memory(GiB)": 10.57,
"step": 1080,
"train_speed(iter/s)": 0.471054
},
{
"epoch": 0.8107603213151504,
"grad_norm": 0.25643857813943166,
"learning_rate": 5.734011866350441e-06,
"loss": 0.1285400390625,
"memory(GiB)": 10.57,
"step": 1085,
"train_speed(iter/s)": 0.471162
},
{
"epoch": 0.814496543994022,
"grad_norm": 0.21115359067598077,
"learning_rate": 5.731592023940377e-06,
"loss": 0.1183837890625,
"memory(GiB)": 10.57,
"step": 1090,
"train_speed(iter/s)": 0.471227
},
{
"epoch": 0.8182327666728937,
"grad_norm": 0.2712675993739738,
"learning_rate": 5.7291617393353644e-06,
"loss": 0.13204345703125,
"memory(GiB)": 10.57,
"step": 1095,
"train_speed(iter/s)": 0.471252
},
{
"epoch": 0.8219689893517653,
"grad_norm": 0.2084250099258058,
"learning_rate": 5.726721021825778e-06,
"loss": 0.11478271484375,
"memory(GiB)": 10.57,
"step": 1100,
"train_speed(iter/s)": 0.471379
},
{
"epoch": 0.8257052120306371,
"grad_norm": 0.2830458697450999,
"learning_rate": 5.724269880741871e-06,
"loss": 0.126416015625,
"memory(GiB)": 10.57,
"step": 1105,
"train_speed(iter/s)": 0.471405
},
{
"epoch": 0.8294414347095087,
"grad_norm": 0.3346887940418336,
"learning_rate": 5.721808325453744e-06,
"loss": 0.120458984375,
"memory(GiB)": 10.57,
"step": 1110,
"train_speed(iter/s)": 0.471343
},
{
"epoch": 0.8331776573883803,
"grad_norm": 0.36391607206986826,
"learning_rate": 5.719336365371309e-06,
"loss": 0.12493896484375,
"memory(GiB)": 10.57,
"step": 1115,
"train_speed(iter/s)": 0.471475
},
{
"epoch": 0.836913880067252,
"grad_norm": 0.24337137317648888,
"learning_rate": 5.716854009944253e-06,
"loss": 0.116259765625,
"memory(GiB)": 10.57,
"step": 1120,
"train_speed(iter/s)": 0.471472
},
{
"epoch": 0.8406501027461236,
"grad_norm": 0.26926385581419715,
"learning_rate": 5.714361268662001e-06,
"loss": 0.12049560546875,
"memory(GiB)": 10.57,
"step": 1125,
"train_speed(iter/s)": 0.471483
},
{
"epoch": 0.8443863254249954,
"grad_norm": 0.21579011337181153,
"learning_rate": 5.711858151053681e-06,
"loss": 0.13843994140625,
"memory(GiB)": 10.57,
"step": 1130,
"train_speed(iter/s)": 0.471564
},
{
"epoch": 0.848122548103867,
"grad_norm": 0.20161584440361865,
"learning_rate": 5.7093446666880895e-06,
"loss": 0.109716796875,
"memory(GiB)": 10.57,
"step": 1135,
"train_speed(iter/s)": 0.471594
},
{
"epoch": 0.8518587707827386,
"grad_norm": 0.188476182825962,
"learning_rate": 5.7068208251736475e-06,
"loss": 0.126171875,
"memory(GiB)": 10.57,
"step": 1140,
"train_speed(iter/s)": 0.471689
},
{
"epoch": 0.8555949934616103,
"grad_norm": 0.27427824113320737,
"learning_rate": 5.704286636158373e-06,
"loss": 0.12137451171875,
"memory(GiB)": 10.57,
"step": 1145,
"train_speed(iter/s)": 0.471734
},
{
"epoch": 0.859331216140482,
"grad_norm": 0.22763746167838253,
"learning_rate": 5.701742109329838e-06,
"loss": 0.13856201171875,
"memory(GiB)": 10.57,
"step": 1150,
"train_speed(iter/s)": 0.471726
},
{
"epoch": 0.8630674388193537,
"grad_norm": 0.17131970459498547,
"learning_rate": 5.6991872544151335e-06,
"loss": 0.14425048828125,
"memory(GiB)": 10.57,
"step": 1155,
"train_speed(iter/s)": 0.471788
},
{
"epoch": 0.8668036614982253,
"grad_norm": 0.23048712465348178,
"learning_rate": 5.696622081180834e-06,
"loss": 0.153955078125,
"memory(GiB)": 10.57,
"step": 1160,
"train_speed(iter/s)": 0.471828
},
{
"epoch": 0.8705398841770969,
"grad_norm": 0.2737430014057503,
"learning_rate": 5.694046599432956e-06,
"loss": 0.116259765625,
"memory(GiB)": 10.57,
"step": 1165,
"train_speed(iter/s)": 0.471811
},
{
"epoch": 0.8742761068559686,
"grad_norm": 0.23626021988375195,
"learning_rate": 5.691460819016923e-06,
"loss": 0.1245849609375,
"memory(GiB)": 10.57,
"step": 1170,
"train_speed(iter/s)": 0.471906
},
{
"epoch": 0.8780123295348403,
"grad_norm": 0.27390563050373423,
"learning_rate": 5.68886474981753e-06,
"loss": 0.12216796875,
"memory(GiB)": 10.57,
"step": 1175,
"train_speed(iter/s)": 0.471894
},
{
"epoch": 0.881748552213712,
"grad_norm": 0.3598824701234181,
"learning_rate": 5.686258401758901e-06,
"loss": 0.1288818359375,
"memory(GiB)": 10.57,
"step": 1180,
"train_speed(iter/s)": 0.471866
},
{
"epoch": 0.8854847748925836,
"grad_norm": 0.2803403042160743,
"learning_rate": 5.683641784804454e-06,
"loss": 0.119970703125,
"memory(GiB)": 10.57,
"step": 1185,
"train_speed(iter/s)": 0.471838
},
{
"epoch": 0.8892209975714552,
"grad_norm": 0.24011469363238191,
"learning_rate": 5.681014908956866e-06,
"loss": 0.12734375,
"memory(GiB)": 10.57,
"step": 1190,
"train_speed(iter/s)": 0.471876
},
{
"epoch": 0.8929572202503269,
"grad_norm": 0.23680884380834868,
"learning_rate": 5.6783777842580245e-06,
"loss": 0.131884765625,
"memory(GiB)": 10.57,
"step": 1195,
"train_speed(iter/s)": 0.471946
},
{
"epoch": 0.8966934429291986,
"grad_norm": 0.25067555757294774,
"learning_rate": 5.6757304207890006e-06,
"loss": 0.11749267578125,
"memory(GiB)": 10.57,
"step": 1200,
"train_speed(iter/s)": 0.471919
},
{
"epoch": 0.9004296656080703,
"grad_norm": 0.25663340180554484,
"learning_rate": 5.673072828670005e-06,
"loss": 0.1390380859375,
"memory(GiB)": 10.57,
"step": 1205,
"train_speed(iter/s)": 0.471961
},
{
"epoch": 0.9041658882869419,
"grad_norm": 0.34196712108358773,
"learning_rate": 5.670405018060349e-06,
"loss": 0.1314453125,
"memory(GiB)": 10.57,
"step": 1210,
"train_speed(iter/s)": 0.472017
},
{
"epoch": 0.9079021109658135,
"grad_norm": 0.25320961648503115,
"learning_rate": 5.667726999158408e-06,
"loss": 0.11199951171875,
"memory(GiB)": 10.57,
"step": 1215,
"train_speed(iter/s)": 0.472063
},
{
"epoch": 0.9116383336446853,
"grad_norm": 0.2895202800969726,
"learning_rate": 5.665038782201579e-06,
"loss": 0.11494140625,
"memory(GiB)": 10.57,
"step": 1220,
"train_speed(iter/s)": 0.472149
},
{
"epoch": 0.9153745563235569,
"grad_norm": 0.24353976208363304,
"learning_rate": 5.662340377466246e-06,
"loss": 0.13350830078125,
"memory(GiB)": 10.57,
"step": 1225,
"train_speed(iter/s)": 0.472205
},
{
"epoch": 0.9191107790024285,
"grad_norm": 0.21343931443362257,
"learning_rate": 5.659631795267736e-06,
"loss": 0.1358642578125,
"memory(GiB)": 10.57,
"step": 1230,
"train_speed(iter/s)": 0.472292
},
{
"epoch": 0.9228470016813002,
"grad_norm": 0.18836231763075187,
"learning_rate": 5.656913045960284e-06,
"loss": 0.1275634765625,
"memory(GiB)": 10.57,
"step": 1235,
"train_speed(iter/s)": 0.472331
},
{
"epoch": 0.9265832243601718,
"grad_norm": 0.4626722838861778,
"learning_rate": 5.65418413993699e-06,
"loss": 0.12288818359375,
"memory(GiB)": 10.57,
"step": 1240,
"train_speed(iter/s)": 0.472384
},
{
"epoch": 0.9303194470390436,
"grad_norm": 0.2791944611984056,
"learning_rate": 5.651445087629781e-06,
"loss": 0.12313232421875,
"memory(GiB)": 10.57,
"step": 1245,
"train_speed(iter/s)": 0.47236
},
{
"epoch": 0.9340556697179152,
"grad_norm": 0.2351927769190445,
"learning_rate": 5.648695899509373e-06,
"loss": 0.12640380859375,
"memory(GiB)": 10.57,
"step": 1250,
"train_speed(iter/s)": 0.472318
},
{
"epoch": 0.9377918923967868,
"grad_norm": 0.29167608891344404,
"learning_rate": 5.6459365860852225e-06,
"loss": 0.1332763671875,
"memory(GiB)": 10.57,
"step": 1255,
"train_speed(iter/s)": 0.472324
},
{
"epoch": 0.9415281150756585,
"grad_norm": 0.3389174699822604,
"learning_rate": 5.643167157905499e-06,
"loss": 0.1290771484375,
"memory(GiB)": 10.57,
"step": 1260,
"train_speed(iter/s)": 0.472422
},
{
"epoch": 0.9452643377545301,
"grad_norm": 0.19240685493137236,
"learning_rate": 5.640387625557036e-06,
"loss": 0.11680908203125,
"memory(GiB)": 10.57,
"step": 1265,
"train_speed(iter/s)": 0.472422
},
{
"epoch": 0.9490005604334019,
"grad_norm": 0.26444195491643885,
"learning_rate": 5.63759799966529e-06,
"loss": 0.139111328125,
"memory(GiB)": 10.57,
"step": 1270,
"train_speed(iter/s)": 0.4725
},
{
"epoch": 0.9527367831122735,
"grad_norm": 0.2630005422058253,
"learning_rate": 5.634798290894306e-06,
"loss": 0.1197265625,
"memory(GiB)": 10.57,
"step": 1275,
"train_speed(iter/s)": 0.472495
},
{
"epoch": 0.9564730057911451,
"grad_norm": 0.23145820253558871,
"learning_rate": 5.631988509946674e-06,
"loss": 0.1113037109375,
"memory(GiB)": 10.57,
"step": 1280,
"train_speed(iter/s)": 0.472494
},
{
"epoch": 0.9602092284700168,
"grad_norm": 0.2899148756938717,
"learning_rate": 5.629168667563484e-06,
"loss": 0.121484375,
"memory(GiB)": 10.57,
"step": 1285,
"train_speed(iter/s)": 0.472497
},
{
"epoch": 0.9639454511488885,
"grad_norm": 0.36548878879119173,
"learning_rate": 5.62633877452429e-06,
"loss": 0.12415771484375,
"memory(GiB)": 10.57,
"step": 1290,
"train_speed(iter/s)": 0.472442
},
{
"epoch": 0.9676816738277602,
"grad_norm": 0.25528341367700647,
"learning_rate": 5.623498841647067e-06,
"loss": 0.1307861328125,
"memory(GiB)": 10.57,
"step": 1295,
"train_speed(iter/s)": 0.472498
},
{
"epoch": 0.9714178965066318,
"grad_norm": 0.1948509154997499,
"learning_rate": 5.620648879788172e-06,
"loss": 0.122802734375,
"memory(GiB)": 10.57,
"step": 1300,
"train_speed(iter/s)": 0.47247
},
{
"epoch": 0.9751541191855034,
"grad_norm": 0.23395818708390523,
"learning_rate": 5.617788899842296e-06,
"loss": 0.1336181640625,
"memory(GiB)": 10.57,
"step": 1305,
"train_speed(iter/s)": 0.472478
},
{
"epoch": 0.9788903418643751,
"grad_norm": 0.22129751131979117,
"learning_rate": 5.61491891274243e-06,
"loss": 0.11290283203125,
"memory(GiB)": 10.57,
"step": 1310,
"train_speed(iter/s)": 0.47253
},
{
"epoch": 0.9826265645432468,
"grad_norm": 0.38768444008771463,
"learning_rate": 5.6120389294598185e-06,
"loss": 0.1374267578125,
"memory(GiB)": 10.57,
"step": 1315,
"train_speed(iter/s)": 0.472627
},
{
"epoch": 0.9863627872221185,
"grad_norm": 0.2634727672178905,
"learning_rate": 5.609148961003919e-06,
"loss": 0.10865478515625,
"memory(GiB)": 10.57,
"step": 1320,
"train_speed(iter/s)": 0.472642
},
{
"epoch": 0.9900990099009901,
"grad_norm": 0.2693250349909997,
"learning_rate": 5.606249018422361e-06,
"loss": 0.121435546875,
"memory(GiB)": 10.57,
"step": 1325,
"train_speed(iter/s)": 0.472693
},
{
"epoch": 0.9938352325798617,
"grad_norm": 0.3142833629244817,
"learning_rate": 5.603339112800902e-06,
"loss": 0.127587890625,
"memory(GiB)": 10.57,
"step": 1330,
"train_speed(iter/s)": 0.472723
},
{
"epoch": 0.9975714552587335,
"grad_norm": 0.260475289320075,
"learning_rate": 5.600419255263382e-06,
"loss": 0.12655029296875,
"memory(GiB)": 10.57,
"step": 1335,
"train_speed(iter/s)": 0.472744
},
{
"epoch": 1.0007472445357743,
"grad_norm": 0.1995569301298896,
"learning_rate": 5.5974894569716925e-06,
"loss": 0.11612548828125,
"memory(GiB)": 10.57,
"step": 1340,
"train_speed(iter/s)": 0.472443
},
{
"epoch": 1.004483467214646,
"grad_norm": 0.19232697366661053,
"learning_rate": 5.594549729125718e-06,
"loss": 0.09854736328125,
"memory(GiB)": 10.57,
"step": 1345,
"train_speed(iter/s)": 0.472505
},
{
"epoch": 1.0082196898935176,
"grad_norm": 0.33732827706175905,
"learning_rate": 5.591600082963308e-06,
"loss": 0.115234375,
"memory(GiB)": 10.57,
"step": 1350,
"train_speed(iter/s)": 0.472424
},
{
"epoch": 1.0119559125723894,
"grad_norm": 0.18176928497159583,
"learning_rate": 5.58864052976022e-06,
"loss": 0.10379638671875,
"memory(GiB)": 10.57,
"step": 1355,
"train_speed(iter/s)": 0.472477
},
{
"epoch": 1.015692135251261,
"grad_norm": 0.357592464764428,
"learning_rate": 5.585671080830091e-06,
"loss": 0.1013671875,
"memory(GiB)": 10.57,
"step": 1360,
"train_speed(iter/s)": 0.472527
},
{
"epoch": 1.0194283579301326,
"grad_norm": 0.2808114053275493,
"learning_rate": 5.5826917475243834e-06,
"loss": 0.097698974609375,
"memory(GiB)": 10.57,
"step": 1365,
"train_speed(iter/s)": 0.47262
},
{
"epoch": 1.0231645806090044,
"grad_norm": 0.17182129292938358,
"learning_rate": 5.579702541232344e-06,
"loss": 0.10177001953125,
"memory(GiB)": 10.57,
"step": 1370,
"train_speed(iter/s)": 0.472515
},
{
"epoch": 1.026900803287876,
"grad_norm": 0.36516118234394773,
"learning_rate": 5.576703473380963e-06,
"loss": 0.10394287109375,
"memory(GiB)": 10.57,
"step": 1375,
"train_speed(iter/s)": 0.472485
},
{
"epoch": 1.0306370259667477,
"grad_norm": 0.12994331570229598,
"learning_rate": 5.573694555434929e-06,
"loss": 0.09647216796875,
"memory(GiB)": 10.57,
"step": 1380,
"train_speed(iter/s)": 0.472423
},
{
"epoch": 1.0343732486456192,
"grad_norm": 0.2439500781039156,
"learning_rate": 5.570675798896584e-06,
"loss": 0.09913330078125,
"memory(GiB)": 10.57,
"step": 1385,
"train_speed(iter/s)": 0.472472
},
{
"epoch": 1.038109471324491,
"grad_norm": 0.24510427147093836,
"learning_rate": 5.567647215305884e-06,
"loss": 0.10660400390625,
"memory(GiB)": 10.57,
"step": 1390,
"train_speed(iter/s)": 0.472502
},
{
"epoch": 1.0418456940033627,
"grad_norm": 0.3224514432487436,
"learning_rate": 5.564608816240345e-06,
"loss": 0.1132080078125,
"memory(GiB)": 10.57,
"step": 1395,
"train_speed(iter/s)": 0.472569
},
{
"epoch": 1.0455819166822342,
"grad_norm": 0.23587230778852436,
"learning_rate": 5.56156061331501e-06,
"loss": 0.0916259765625,
"memory(GiB)": 10.57,
"step": 1400,
"train_speed(iter/s)": 0.472605
},
{
"epoch": 1.049318139361106,
"grad_norm": 0.22597459572374368,
"learning_rate": 5.5585026181823994e-06,
"loss": 0.10594482421875,
"memory(GiB)": 10.57,
"step": 1405,
"train_speed(iter/s)": 0.472556
},
{
"epoch": 1.0530543620399775,
"grad_norm": 0.19096269961906193,
"learning_rate": 5.555434842532465e-06,
"loss": 0.089910888671875,
"memory(GiB)": 10.57,
"step": 1410,
"train_speed(iter/s)": 0.472594
},
{
"epoch": 1.0567905847188492,
"grad_norm": 0.21611547990188876,
"learning_rate": 5.552357298092549e-06,
"loss": 0.10777587890625,
"memory(GiB)": 10.57,
"step": 1415,
"train_speed(iter/s)": 0.472614
},
{
"epoch": 1.060526807397721,
"grad_norm": 0.2651855509481471,
"learning_rate": 5.549269996627335e-06,
"loss": 0.104296875,
"memory(GiB)": 10.57,
"step": 1420,
"train_speed(iter/s)": 0.472711
},
{
"epoch": 1.0642630300765925,
"grad_norm": 0.2884021435709037,
"learning_rate": 5.546172949938806e-06,
"loss": 0.09815673828125,
"memory(GiB)": 10.57,
"step": 1425,
"train_speed(iter/s)": 0.472743
},
{
"epoch": 1.0679992527554643,
"grad_norm": 0.3272777127266579,
"learning_rate": 5.5430661698661995e-06,
"loss": 0.09771728515625,
"memory(GiB)": 10.57,
"step": 1430,
"train_speed(iter/s)": 0.472793
},
{
"epoch": 1.0717354754343358,
"grad_norm": 0.22908749879031715,
"learning_rate": 5.539949668285962e-06,
"loss": 0.11275634765625,
"memory(GiB)": 10.57,
"step": 1435,
"train_speed(iter/s)": 0.472759
},
{
"epoch": 1.0754716981132075,
"grad_norm": 0.20839037146203993,
"learning_rate": 5.5368234571117e-06,
"loss": 0.1127685546875,
"memory(GiB)": 10.57,
"step": 1440,
"train_speed(iter/s)": 0.472726
},
{
"epoch": 1.0792079207920793,
"grad_norm": 0.21433788637796058,
"learning_rate": 5.533687548294139e-06,
"loss": 0.102685546875,
"memory(GiB)": 10.57,
"step": 1445,
"train_speed(iter/s)": 0.472795
},
{
"epoch": 1.0829441434709508,
"grad_norm": 0.2194852609411041,
"learning_rate": 5.530541953821078e-06,
"loss": 0.1194580078125,
"memory(GiB)": 10.57,
"step": 1450,
"train_speed(iter/s)": 0.472644
},
{
"epoch": 1.0866803661498226,
"grad_norm": 0.2119142735733801,
"learning_rate": 5.5273866857173375e-06,
"loss": 0.09979248046875,
"memory(GiB)": 10.57,
"step": 1455,
"train_speed(iter/s)": 0.472692
},
{
"epoch": 1.090416588828694,
"grad_norm": 0.18271859704191354,
"learning_rate": 5.524221756044723e-06,
"loss": 0.10120849609375,
"memory(GiB)": 10.57,
"step": 1460,
"train_speed(iter/s)": 0.472786
},
{
"epoch": 1.0941528115075658,
"grad_norm": 0.1965749879154183,
"learning_rate": 5.521047176901968e-06,
"loss": 0.09178466796875,
"memory(GiB)": 10.57,
"step": 1465,
"train_speed(iter/s)": 0.472753
},
{
"epoch": 1.0978890341864376,
"grad_norm": 0.3529079661879815,
"learning_rate": 5.5178629604247e-06,
"loss": 0.099200439453125,
"memory(GiB)": 10.57,
"step": 1470,
"train_speed(iter/s)": 0.4728
},
{
"epoch": 1.101625256865309,
"grad_norm": 0.23509583771318013,
"learning_rate": 5.514669118785383e-06,
"loss": 0.10716552734375,
"memory(GiB)": 10.57,
"step": 1475,
"train_speed(iter/s)": 0.47282
},
{
"epoch": 1.1053614795441808,
"grad_norm": 0.22191044730282325,
"learning_rate": 5.511465664193278e-06,
"loss": 0.1013671875,
"memory(GiB)": 10.57,
"step": 1480,
"train_speed(iter/s)": 0.472823
},
{
"epoch": 1.1090977022230526,
"grad_norm": 0.2697952102289562,
"learning_rate": 5.50825260889439e-06,
"loss": 0.110107421875,
"memory(GiB)": 10.57,
"step": 1485,
"train_speed(iter/s)": 0.47279
},
{
"epoch": 1.1128339249019241,
"grad_norm": 0.34041111425327863,
"learning_rate": 5.505029965171431e-06,
"loss": 0.10975341796875,
"memory(GiB)": 10.57,
"step": 1490,
"train_speed(iter/s)": 0.472756
},
{
"epoch": 1.1165701475807959,
"grad_norm": 0.26883268648527414,
"learning_rate": 5.501797745343762e-06,
"loss": 0.09005126953125,
"memory(GiB)": 10.57,
"step": 1495,
"train_speed(iter/s)": 0.472749
},
{
"epoch": 1.1203063702596674,
"grad_norm": 0.2591995651189346,
"learning_rate": 5.498555961767353e-06,
"loss": 0.1026611328125,
"memory(GiB)": 10.57,
"step": 1500,
"train_speed(iter/s)": 0.472741
},
{
"epoch": 1.1240425929385391,
"grad_norm": 0.20112516368334774,
"learning_rate": 5.495304626834737e-06,
"loss": 0.10999755859375,
"memory(GiB)": 10.57,
"step": 1505,
"train_speed(iter/s)": 0.472822
},
{
"epoch": 1.127778815617411,
"grad_norm": 0.31243387047085314,
"learning_rate": 5.492043752974954e-06,
"loss": 0.1138671875,
"memory(GiB)": 10.57,
"step": 1510,
"train_speed(iter/s)": 0.472881
},
{
"epoch": 1.1315150382962824,
"grad_norm": 0.2885339464617385,
"learning_rate": 5.488773352653511e-06,
"loss": 0.103564453125,
"memory(GiB)": 10.57,
"step": 1515,
"train_speed(iter/s)": 0.47291
},
{
"epoch": 1.1352512609751542,
"grad_norm": 0.2769596342571021,
"learning_rate": 5.485493438372334e-06,
"loss": 0.11546630859375,
"memory(GiB)": 10.57,
"step": 1520,
"train_speed(iter/s)": 0.472812
},
{
"epoch": 1.1389874836540257,
"grad_norm": 0.3103063506129397,
"learning_rate": 5.482204022669716e-06,
"loss": 0.108837890625,
"memory(GiB)": 10.57,
"step": 1525,
"train_speed(iter/s)": 0.472794
},
{
"epoch": 1.1427237063328974,
"grad_norm": 0.2913972666378632,
"learning_rate": 5.478905118120274e-06,
"loss": 0.1053466796875,
"memory(GiB)": 10.57,
"step": 1530,
"train_speed(iter/s)": 0.472759
},
{
"epoch": 1.1464599290117692,
"grad_norm": 0.1805358057399443,
"learning_rate": 5.475596737334896e-06,
"loss": 0.10556640625,
"memory(GiB)": 10.57,
"step": 1535,
"train_speed(iter/s)": 0.472764
},
{
"epoch": 1.1501961516906407,
"grad_norm": 0.18027571971615952,
"learning_rate": 5.472278892960697e-06,
"loss": 0.10286865234375,
"memory(GiB)": 10.57,
"step": 1540,
"train_speed(iter/s)": 0.472798
},
{
"epoch": 1.1539323743695125,
"grad_norm": 0.26360451822838044,
"learning_rate": 5.468951597680969e-06,
"loss": 0.10518798828125,
"memory(GiB)": 10.57,
"step": 1545,
"train_speed(iter/s)": 0.472811
},
{
"epoch": 1.1576685970483842,
"grad_norm": 0.27825790440429315,
"learning_rate": 5.4656148642151315e-06,
"loss": 0.1068115234375,
"memory(GiB)": 10.57,
"step": 1550,
"train_speed(iter/s)": 0.472822
},
{
"epoch": 1.1614048197272557,
"grad_norm": 0.37841928985976586,
"learning_rate": 5.462268705318685e-06,
"loss": 0.105902099609375,
"memory(GiB)": 10.57,
"step": 1555,
"train_speed(iter/s)": 0.472796
},
{
"epoch": 1.1651410424061275,
"grad_norm": 0.20072772545318748,
"learning_rate": 5.458913133783158e-06,
"loss": 0.096240234375,
"memory(GiB)": 10.57,
"step": 1560,
"train_speed(iter/s)": 0.472849
},
{
"epoch": 1.168877265084999,
"grad_norm": 0.2782224873229787,
"learning_rate": 5.455548162436066e-06,
"loss": 0.10538330078125,
"memory(GiB)": 10.57,
"step": 1565,
"train_speed(iter/s)": 0.472866
},
{
"epoch": 1.1726134877638708,
"grad_norm": 0.2611062382021719,
"learning_rate": 5.4521738041408535e-06,
"loss": 0.102545166015625,
"memory(GiB)": 10.57,
"step": 1570,
"train_speed(iter/s)": 0.472925
},
{
"epoch": 1.1763497104427425,
"grad_norm": 0.36681796736657335,
"learning_rate": 5.448790071796851e-06,
"loss": 0.10877685546875,
"memory(GiB)": 10.57,
"step": 1575,
"train_speed(iter/s)": 0.472947
},
{
"epoch": 1.180085933121614,
"grad_norm": 0.31412178351944464,
"learning_rate": 5.445396978339223e-06,
"loss": 0.1108642578125,
"memory(GiB)": 10.57,
"step": 1580,
"train_speed(iter/s)": 0.472961
},
{
"epoch": 1.1838221558004858,
"grad_norm": 0.2945745909309181,
"learning_rate": 5.4419945367389204e-06,
"loss": 0.104638671875,
"memory(GiB)": 10.57,
"step": 1585,
"train_speed(iter/s)": 0.472997
},
{
"epoch": 1.1875583784793573,
"grad_norm": 0.2005694453013891,
"learning_rate": 5.438582760002628e-06,
"loss": 0.11466064453125,
"memory(GiB)": 10.57,
"step": 1590,
"train_speed(iter/s)": 0.472995
},
{
"epoch": 1.191294601158229,
"grad_norm": 0.21815942040257993,
"learning_rate": 5.4351616611727174e-06,
"loss": 0.09090576171875,
"memory(GiB)": 10.57,
"step": 1595,
"train_speed(iter/s)": 0.472983
},
{
"epoch": 1.1950308238371008,
"grad_norm": 0.3254149926280658,
"learning_rate": 5.431731253327197e-06,
"loss": 0.09832763671875,
"memory(GiB)": 10.57,
"step": 1600,
"train_speed(iter/s)": 0.472989
},
{
"epoch": 1.1987670465159723,
"grad_norm": 0.21539039093948628,
"learning_rate": 5.428291549579658e-06,
"loss": 0.0917236328125,
"memory(GiB)": 10.57,
"step": 1605,
"train_speed(iter/s)": 0.472999
},
{
"epoch": 1.202503269194844,
"grad_norm": 0.3980763574441828,
"learning_rate": 5.424842563079231e-06,
"loss": 0.1013427734375,
"memory(GiB)": 10.57,
"step": 1610,
"train_speed(iter/s)": 0.473002
},
{
"epoch": 1.2062394918737156,
"grad_norm": 0.2562644399270751,
"learning_rate": 5.421384307010532e-06,
"loss": 0.12611083984375,
"memory(GiB)": 10.57,
"step": 1615,
"train_speed(iter/s)": 0.473001
},
{
"epoch": 1.2099757145525873,
"grad_norm": 0.21063963603050906,
"learning_rate": 5.41791679459361e-06,
"loss": 0.09677734375,
"memory(GiB)": 10.57,
"step": 1620,
"train_speed(iter/s)": 0.473087
},
{
"epoch": 1.213711937231459,
"grad_norm": 0.26589295201735347,
"learning_rate": 5.4144400390839014e-06,
"loss": 0.10716552734375,
"memory(GiB)": 10.57,
"step": 1625,
"train_speed(iter/s)": 0.473137
},
{
"epoch": 1.2174481599103306,
"grad_norm": 0.3159674300444183,
"learning_rate": 5.410954053772174e-06,
"loss": 0.117822265625,
"memory(GiB)": 10.57,
"step": 1630,
"train_speed(iter/s)": 0.473161
},
{
"epoch": 1.2211843825892024,
"grad_norm": 0.3257909348870682,
"learning_rate": 5.407458851984481e-06,
"loss": 0.105908203125,
"memory(GiB)": 10.57,
"step": 1635,
"train_speed(iter/s)": 0.473064
},
{
"epoch": 1.224920605268074,
"grad_norm": 0.25594963311057084,
"learning_rate": 5.403954447082107e-06,
"loss": 0.1008544921875,
"memory(GiB)": 10.57,
"step": 1640,
"train_speed(iter/s)": 0.473138
},
{
"epoch": 1.2286568279469456,
"grad_norm": 0.27760936809640124,
"learning_rate": 5.400440852461517e-06,
"loss": 0.08446044921875,
"memory(GiB)": 10.57,
"step": 1645,
"train_speed(iter/s)": 0.473198
},
{
"epoch": 1.2323930506258174,
"grad_norm": 0.30926667434610317,
"learning_rate": 5.3969180815543075e-06,
"loss": 0.0973876953125,
"memory(GiB)": 10.57,
"step": 1650,
"train_speed(iter/s)": 0.473211
},
{
"epoch": 1.236129273304689,
"grad_norm": 0.22376369134309534,
"learning_rate": 5.393386147827153e-06,
"loss": 0.08917236328125,
"memory(GiB)": 10.57,
"step": 1655,
"train_speed(iter/s)": 0.473219
},
{
"epoch": 1.2398654959835607,
"grad_norm": 0.3060981242994768,
"learning_rate": 5.3898450647817534e-06,
"loss": 0.095660400390625,
"memory(GiB)": 10.57,
"step": 1660,
"train_speed(iter/s)": 0.47326
},
{
"epoch": 1.2436017186624322,
"grad_norm": 0.2824418483688286,
"learning_rate": 5.386294845954789e-06,
"loss": 0.093310546875,
"memory(GiB)": 10.57,
"step": 1665,
"train_speed(iter/s)": 0.473272
},
{
"epoch": 1.247337941341304,
"grad_norm": 0.36318507390627536,
"learning_rate": 5.382735504917859e-06,
"loss": 0.09969482421875,
"memory(GiB)": 10.57,
"step": 1670,
"train_speed(iter/s)": 0.473338
},
{
"epoch": 1.2510741640201757,
"grad_norm": 0.25998406554963555,
"learning_rate": 5.379167055277436e-06,
"loss": 0.0906982421875,
"memory(GiB)": 10.57,
"step": 1675,
"train_speed(iter/s)": 0.473289
},
{
"epoch": 1.2548103866990472,
"grad_norm": 0.3053060614623874,
"learning_rate": 5.3755895106748135e-06,
"loss": 0.1009033203125,
"memory(GiB)": 10.57,
"step": 1680,
"train_speed(iter/s)": 0.473278
},
{
"epoch": 1.258546609377919,
"grad_norm": 0.3304211891993834,
"learning_rate": 5.372002884786053e-06,
"loss": 0.080206298828125,
"memory(GiB)": 10.57,
"step": 1685,
"train_speed(iter/s)": 0.473247
},
{
"epoch": 1.2622828320567905,
"grad_norm": 0.3786132572419238,
"learning_rate": 5.368407191321929e-06,
"loss": 0.11483154296875,
"memory(GiB)": 10.57,
"step": 1690,
"train_speed(iter/s)": 0.473224
},
{
"epoch": 1.2660190547356622,
"grad_norm": 0.4098142898034233,
"learning_rate": 5.364802444027881e-06,
"loss": 0.11900634765625,
"memory(GiB)": 10.57,
"step": 1695,
"train_speed(iter/s)": 0.473255
},
{
"epoch": 1.269755277414534,
"grad_norm": 0.31832382239724993,
"learning_rate": 5.36118865668396e-06,
"loss": 0.100079345703125,
"memory(GiB)": 10.57,
"step": 1700,
"train_speed(iter/s)": 0.473256
},
{
"epoch": 1.2734915000934055,
"grad_norm": 0.21787448497633385,
"learning_rate": 5.357565843104772e-06,
"loss": 0.1089111328125,
"memory(GiB)": 10.57,
"step": 1705,
"train_speed(iter/s)": 0.473319
},
{
"epoch": 1.2772277227722773,
"grad_norm": 0.24048814888237727,
"learning_rate": 5.3539340171394315e-06,
"loss": 0.103173828125,
"memory(GiB)": 10.57,
"step": 1710,
"train_speed(iter/s)": 0.473382
},
{
"epoch": 1.2809639454511488,
"grad_norm": 0.2628088064912976,
"learning_rate": 5.350293192671502e-06,
"loss": 0.1017578125,
"memory(GiB)": 10.57,
"step": 1715,
"train_speed(iter/s)": 0.473392
},
{
"epoch": 1.2847001681300205,
"grad_norm": 0.19682320473371387,
"learning_rate": 5.3466433836189466e-06,
"loss": 0.10618896484375,
"memory(GiB)": 10.57,
"step": 1720,
"train_speed(iter/s)": 0.473367
},
{
"epoch": 1.2884363908088923,
"grad_norm": 0.31166282334428463,
"learning_rate": 5.342984603934075e-06,
"loss": 0.0931884765625,
"memory(GiB)": 10.57,
"step": 1725,
"train_speed(iter/s)": 0.473394
},
{
"epoch": 1.2921726134877638,
"grad_norm": 0.4426055463824898,
"learning_rate": 5.3393168676034925e-06,
"loss": 0.10029296875,
"memory(GiB)": 10.57,
"step": 1730,
"train_speed(iter/s)": 0.473469
},
{
"epoch": 1.2959088361666355,
"grad_norm": 0.19012257878940111,
"learning_rate": 5.335640188648036e-06,
"loss": 0.0994873046875,
"memory(GiB)": 10.57,
"step": 1735,
"train_speed(iter/s)": 0.473506
},
{
"epoch": 1.299645058845507,
"grad_norm": 0.2509436471905221,
"learning_rate": 5.3319545811227345e-06,
"loss": 0.10556640625,
"memory(GiB)": 10.57,
"step": 1740,
"train_speed(iter/s)": 0.47352
},
{
"epoch": 1.3033812815243788,
"grad_norm": 0.30945571438082825,
"learning_rate": 5.328260059116746e-06,
"loss": 0.10347900390625,
"memory(GiB)": 10.57,
"step": 1745,
"train_speed(iter/s)": 0.473584
},
{
"epoch": 1.3071175042032506,
"grad_norm": 0.2842323038315994,
"learning_rate": 5.324556636753305e-06,
"loss": 0.0927490234375,
"memory(GiB)": 10.57,
"step": 1750,
"train_speed(iter/s)": 0.473591
},
{
"epoch": 1.310853726882122,
"grad_norm": 0.22529076559497616,
"learning_rate": 5.320844328189674e-06,
"loss": 0.10736083984375,
"memory(GiB)": 10.57,
"step": 1755,
"train_speed(iter/s)": 0.473593
},
{
"epoch": 1.3145899495609938,
"grad_norm": 0.25966109665415044,
"learning_rate": 5.31712314761708e-06,
"loss": 0.09718017578125,
"memory(GiB)": 10.57,
"step": 1760,
"train_speed(iter/s)": 0.473634
},
{
"epoch": 1.3183261722398654,
"grad_norm": 0.2824285315852678,
"learning_rate": 5.31339310926067e-06,
"loss": 0.1147216796875,
"memory(GiB)": 10.57,
"step": 1765,
"train_speed(iter/s)": 0.473682
},
{
"epoch": 1.3220623949187371,
"grad_norm": 0.29212776422688475,
"learning_rate": 5.30965422737945e-06,
"loss": 0.106103515625,
"memory(GiB)": 10.57,
"step": 1770,
"train_speed(iter/s)": 0.473711
},
{
"epoch": 1.3257986175976089,
"grad_norm": 0.21770600045083738,
"learning_rate": 5.305906516266232e-06,
"loss": 0.09356689453125,
"memory(GiB)": 10.57,
"step": 1775,
"train_speed(iter/s)": 0.473749
},
{
"epoch": 1.3295348402764806,
"grad_norm": 0.22535805175359133,
"learning_rate": 5.302149990247581e-06,
"loss": 0.09854736328125,
"memory(GiB)": 10.57,
"step": 1780,
"train_speed(iter/s)": 0.47377
},
{
"epoch": 1.3332710629553521,
"grad_norm": 0.3731424208017629,
"learning_rate": 5.298384663683759e-06,
"loss": 0.10096435546875,
"memory(GiB)": 10.57,
"step": 1785,
"train_speed(iter/s)": 0.473814
},
{
"epoch": 1.3370072856342237,
"grad_norm": 0.19409382195361594,
"learning_rate": 5.29461055096867e-06,
"loss": 0.0933837890625,
"memory(GiB)": 10.57,
"step": 1790,
"train_speed(iter/s)": 0.473846
},
{
"epoch": 1.3407435083130954,
"grad_norm": 0.20858019331443553,
"learning_rate": 5.290827666529807e-06,
"loss": 0.09691162109375,
"memory(GiB)": 10.57,
"step": 1795,
"train_speed(iter/s)": 0.473812
},
{
"epoch": 1.3444797309919672,
"grad_norm": 0.21508957217260072,
"learning_rate": 5.287036024828191e-06,
"loss": 0.112396240234375,
"memory(GiB)": 10.57,
"step": 1800,
"train_speed(iter/s)": 0.473874
},
{
"epoch": 1.348215953670839,
"grad_norm": 0.21088809922179003,
"learning_rate": 5.283235640358326e-06,
"loss": 0.10013427734375,
"memory(GiB)": 10.57,
"step": 1805,
"train_speed(iter/s)": 0.473898
},
{
"epoch": 1.3519521763497104,
"grad_norm": 0.2980687891825392,
"learning_rate": 5.27942652764813e-06,
"loss": 0.12469482421875,
"memory(GiB)": 10.57,
"step": 1810,
"train_speed(iter/s)": 0.473908
},
{
"epoch": 1.3556883990285822,
"grad_norm": 0.26579488787728855,
"learning_rate": 5.275608701258893e-06,
"loss": 0.09619140625,
"memory(GiB)": 10.57,
"step": 1815,
"train_speed(iter/s)": 0.473922
},
{
"epoch": 1.3594246217074537,
"grad_norm": 0.18737292024034827,
"learning_rate": 5.271782175785213e-06,
"loss": 0.08944091796875,
"memory(GiB)": 10.57,
"step": 1820,
"train_speed(iter/s)": 0.473933
},
{
"epoch": 1.3631608443863255,
"grad_norm": 0.24782345412701354,
"learning_rate": 5.2679469658549425e-06,
"loss": 0.09827880859375,
"memory(GiB)": 10.57,
"step": 1825,
"train_speed(iter/s)": 0.473873
},
{
"epoch": 1.3668970670651972,
"grad_norm": 0.32532596436786243,
"learning_rate": 5.26410308612913e-06,
"loss": 0.09747314453125,
"memory(GiB)": 10.57,
"step": 1830,
"train_speed(iter/s)": 0.473915
},
{
"epoch": 1.3706332897440687,
"grad_norm": 0.31097616250716587,
"learning_rate": 5.2602505513019725e-06,
"loss": 0.1041748046875,
"memory(GiB)": 10.57,
"step": 1835,
"train_speed(iter/s)": 0.473886
},
{
"epoch": 1.3743695124229405,
"grad_norm": 0.3233980057122036,
"learning_rate": 5.256389376100747e-06,
"loss": 0.10128173828125,
"memory(GiB)": 10.57,
"step": 1840,
"train_speed(iter/s)": 0.473889
},
{
"epoch": 1.378105735101812,
"grad_norm": 0.2838217794938913,
"learning_rate": 5.252519575285765e-06,
"loss": 0.10989990234375,
"memory(GiB)": 10.57,
"step": 1845,
"train_speed(iter/s)": 0.473918
},
{
"epoch": 1.3818419577806837,
"grad_norm": 0.2857844265885774,
"learning_rate": 5.248641163650309e-06,
"loss": 0.101458740234375,
"memory(GiB)": 10.57,
"step": 1850,
"train_speed(iter/s)": 0.473944
},
{
"epoch": 1.3855781804595555,
"grad_norm": 0.3117055756844236,
"learning_rate": 5.244754156020577e-06,
"loss": 0.10926513671875,
"memory(GiB)": 10.57,
"step": 1855,
"train_speed(iter/s)": 0.473967
},
{
"epoch": 1.389314403138427,
"grad_norm": 0.1920114429204594,
"learning_rate": 5.240858567255634e-06,
"loss": 0.110009765625,
"memory(GiB)": 10.57,
"step": 1860,
"train_speed(iter/s)": 0.473962
},
{
"epoch": 1.3930506258172988,
"grad_norm": 0.3502090927498937,
"learning_rate": 5.236954412247341e-06,
"loss": 0.11763916015625,
"memory(GiB)": 10.57,
"step": 1865,
"train_speed(iter/s)": 0.473991
},
{
"epoch": 1.3967868484961703,
"grad_norm": 0.23316922643496588,
"learning_rate": 5.2330417059203095e-06,
"loss": 0.1151123046875,
"memory(GiB)": 10.57,
"step": 1870,
"train_speed(iter/s)": 0.474034
},
{
"epoch": 1.400523071175042,
"grad_norm": 0.2549951722054464,
"learning_rate": 5.22912046323184e-06,
"loss": 0.110504150390625,
"memory(GiB)": 10.57,
"step": 1875,
"train_speed(iter/s)": 0.474059
},
{
"epoch": 1.4042592938539138,
"grad_norm": 0.1708829919522614,
"learning_rate": 5.225190699171865e-06,
"loss": 0.08787841796875,
"memory(GiB)": 10.57,
"step": 1880,
"train_speed(iter/s)": 0.474029
},
{
"epoch": 1.4079955165327853,
"grad_norm": 0.27196811779503416,
"learning_rate": 5.221252428762893e-06,
"loss": 0.11351318359375,
"memory(GiB)": 10.57,
"step": 1885,
"train_speed(iter/s)": 0.474046
},
{
"epoch": 1.411731739211657,
"grad_norm": 0.23328619371671638,
"learning_rate": 5.217305667059948e-06,
"loss": 0.101446533203125,
"memory(GiB)": 10.57,
"step": 1890,
"train_speed(iter/s)": 0.474076
},
{
"epoch": 1.4154679618905286,
"grad_norm": 0.18762276770097455,
"learning_rate": 5.213350429150517e-06,
"loss": 0.10950927734375,
"memory(GiB)": 10.57,
"step": 1895,
"train_speed(iter/s)": 0.474023
},
{
"epoch": 1.4192041845694003,
"grad_norm": 0.26686273546353123,
"learning_rate": 5.209386730154487e-06,
"loss": 0.10045166015625,
"memory(GiB)": 10.57,
"step": 1900,
"train_speed(iter/s)": 0.474076
},
{
"epoch": 1.422940407248272,
"grad_norm": 0.3085786825020616,
"learning_rate": 5.205414585224091e-06,
"loss": 0.10711669921875,
"memory(GiB)": 10.57,
"step": 1905,
"train_speed(iter/s)": 0.474098
},
{
"epoch": 1.4266766299271436,
"grad_norm": 0.3905887360768796,
"learning_rate": 5.2014340095438476e-06,
"loss": 0.118505859375,
"memory(GiB)": 10.57,
"step": 1910,
"train_speed(iter/s)": 0.474116
},
{
"epoch": 1.4304128526060154,
"grad_norm": 0.2752084354347657,
"learning_rate": 5.197445018330506e-06,
"loss": 0.09713134765625,
"memory(GiB)": 10.57,
"step": 1915,
"train_speed(iter/s)": 0.47414
},
{
"epoch": 1.4341490752848869,
"grad_norm": 0.25638122340507086,
"learning_rate": 5.193447626832984e-06,
"loss": 0.1004638671875,
"memory(GiB)": 10.57,
"step": 1920,
"train_speed(iter/s)": 0.474127
},
{
"epoch": 1.4378852979637586,
"grad_norm": 0.3365573737926719,
"learning_rate": 5.189441850332312e-06,
"loss": 0.096502685546875,
"memory(GiB)": 10.57,
"step": 1925,
"train_speed(iter/s)": 0.474083
},
{
"epoch": 1.4416215206426304,
"grad_norm": 0.1924187499510245,
"learning_rate": 5.185427704141573e-06,
"loss": 0.124609375,
"memory(GiB)": 10.57,
"step": 1930,
"train_speed(iter/s)": 0.474111
},
{
"epoch": 1.445357743321502,
"grad_norm": 0.28660368393049557,
"learning_rate": 5.181405203605849e-06,
"loss": 0.10279541015625,
"memory(GiB)": 10.57,
"step": 1935,
"train_speed(iter/s)": 0.474107
},
{
"epoch": 1.4490939660003737,
"grad_norm": 0.26275748472823024,
"learning_rate": 5.177374364102156e-06,
"loss": 0.1211669921875,
"memory(GiB)": 10.57,
"step": 1940,
"train_speed(iter/s)": 0.474117
},
{
"epoch": 1.4528301886792452,
"grad_norm": 0.30473179680325724,
"learning_rate": 5.1733352010393855e-06,
"loss": 0.1116455078125,
"memory(GiB)": 10.57,
"step": 1945,
"train_speed(iter/s)": 0.474149
},
{
"epoch": 1.456566411358117,
"grad_norm": 0.2980857699329149,
"learning_rate": 5.169287729858254e-06,
"loss": 0.09521484375,
"memory(GiB)": 10.57,
"step": 1950,
"train_speed(iter/s)": 0.474117
},
{
"epoch": 1.4603026340369887,
"grad_norm": 0.3892418519621433,
"learning_rate": 5.165231966031231e-06,
"loss": 0.10706787109375,
"memory(GiB)": 10.57,
"step": 1955,
"train_speed(iter/s)": 0.474167
},
{
"epoch": 1.4640388567158602,
"grad_norm": 0.26876863290437225,
"learning_rate": 5.161167925062492e-06,
"loss": 0.0955810546875,
"memory(GiB)": 10.57,
"step": 1960,
"train_speed(iter/s)": 0.474231
},
{
"epoch": 1.467775079394732,
"grad_norm": 0.23766298983672868,
"learning_rate": 5.15709562248785e-06,
"loss": 0.1157470703125,
"memory(GiB)": 10.57,
"step": 1965,
"train_speed(iter/s)": 0.474264
},
{
"epoch": 1.4715113020736035,
"grad_norm": 0.2475077256620063,
"learning_rate": 5.153015073874704e-06,
"loss": 0.103997802734375,
"memory(GiB)": 10.57,
"step": 1970,
"train_speed(iter/s)": 0.474248
},
{
"epoch": 1.4752475247524752,
"grad_norm": 0.2529463798672503,
"learning_rate": 5.148926294821973e-06,
"loss": 0.09212646484375,
"memory(GiB)": 10.57,
"step": 1975,
"train_speed(iter/s)": 0.474282
},
{
"epoch": 1.478983747431347,
"grad_norm": 0.34121952234096015,
"learning_rate": 5.144829300960038e-06,
"loss": 0.09998779296875,
"memory(GiB)": 10.57,
"step": 1980,
"train_speed(iter/s)": 0.474279
},
{
"epoch": 1.4827199701102185,
"grad_norm": 0.26555171567768715,
"learning_rate": 5.140724107950687e-06,
"loss": 0.10701904296875,
"memory(GiB)": 10.57,
"step": 1985,
"train_speed(iter/s)": 0.474325
},
{
"epoch": 1.4864561927890902,
"grad_norm": 0.3012526382519,
"learning_rate": 5.136610731487047e-06,
"loss": 0.10223388671875,
"memory(GiB)": 10.57,
"step": 1990,
"train_speed(iter/s)": 0.474388
},
{
"epoch": 1.4901924154679618,
"grad_norm": 0.2585567492074306,
"learning_rate": 5.13248918729353e-06,
"loss": 0.110015869140625,
"memory(GiB)": 10.57,
"step": 1995,
"train_speed(iter/s)": 0.474458
},
{
"epoch": 1.4939286381468335,
"grad_norm": 0.21553275657329446,
"learning_rate": 5.128359491125772e-06,
"loss": 0.10537109375,
"memory(GiB)": 10.57,
"step": 2000,
"train_speed(iter/s)": 0.474436
},
{
"epoch": 1.4976648608257053,
"grad_norm": 0.23393892148099255,
"learning_rate": 5.1242216587705726e-06,
"loss": 0.09471435546875,
"memory(GiB)": 10.57,
"step": 2005,
"train_speed(iter/s)": 0.474455
},
{
"epoch": 1.501401083504577,
"grad_norm": 0.1982523301744199,
"learning_rate": 5.1200757060458305e-06,
"loss": 0.094744873046875,
"memory(GiB)": 10.57,
"step": 2010,
"train_speed(iter/s)": 0.47441
},
{
"epoch": 1.5051373061834485,
"grad_norm": 0.24897221547603635,
"learning_rate": 5.11592164880049e-06,
"loss": 0.094281005859375,
"memory(GiB)": 10.57,
"step": 2015,
"train_speed(iter/s)": 0.474368
},
{
"epoch": 1.50887352886232,
"grad_norm": 0.2524388493286587,
"learning_rate": 5.111759502914477e-06,
"loss": 0.10567626953125,
"memory(GiB)": 10.57,
"step": 2020,
"train_speed(iter/s)": 0.474413
},
{
"epoch": 1.5126097515411918,
"grad_norm": 0.2821918241104093,
"learning_rate": 5.107589284298635e-06,
"loss": 0.10643310546875,
"memory(GiB)": 10.57,
"step": 2025,
"train_speed(iter/s)": 0.474445
},
{
"epoch": 1.5163459742200636,
"grad_norm": 0.1949063316633063,
"learning_rate": 5.10341100889467e-06,
"loss": 0.10220947265625,
"memory(GiB)": 10.57,
"step": 2030,
"train_speed(iter/s)": 0.474452
},
{
"epoch": 1.5200821968989353,
"grad_norm": 0.2664640714650226,
"learning_rate": 5.0992246926750866e-06,
"loss": 0.1039306640625,
"memory(GiB)": 10.57,
"step": 2035,
"train_speed(iter/s)": 0.474496
},
{
"epoch": 1.5238184195778068,
"grad_norm": 0.15706118788240764,
"learning_rate": 5.095030351643129e-06,
"loss": 0.0922210693359375,
"memory(GiB)": 10.57,
"step": 2040,
"train_speed(iter/s)": 0.474486
},
{
"epoch": 1.5275546422566784,
"grad_norm": 0.2929327425758124,
"learning_rate": 5.090828001832715e-06,
"loss": 0.1028076171875,
"memory(GiB)": 10.57,
"step": 2045,
"train_speed(iter/s)": 0.474519
},
{
"epoch": 1.5312908649355501,
"grad_norm": 0.21976773396934837,
"learning_rate": 5.0866176593083805e-06,
"loss": 0.1067626953125,
"memory(GiB)": 10.57,
"step": 2050,
"train_speed(iter/s)": 0.474561
},
{
"epoch": 1.5350270876144219,
"grad_norm": 0.22682626802364397,
"learning_rate": 5.082399340165214e-06,
"loss": 0.10389404296875,
"memory(GiB)": 10.57,
"step": 2055,
"train_speed(iter/s)": 0.474594
},
{
"epoch": 1.5387633102932936,
"grad_norm": 0.2279293975450204,
"learning_rate": 5.0781730605287985e-06,
"loss": 0.102423095703125,
"memory(GiB)": 10.57,
"step": 2060,
"train_speed(iter/s)": 0.474651
},
{
"epoch": 1.5424995329721651,
"grad_norm": 0.21127637298228888,
"learning_rate": 5.073938836555145e-06,
"loss": 0.11668701171875,
"memory(GiB)": 10.57,
"step": 2065,
"train_speed(iter/s)": 0.474653
},
{
"epoch": 1.5462357556510367,
"grad_norm": 0.23883103143189194,
"learning_rate": 5.069696684430639e-06,
"loss": 0.10777587890625,
"memory(GiB)": 10.57,
"step": 2070,
"train_speed(iter/s)": 0.474569
},
{
"epoch": 1.5499719783299084,
"grad_norm": 0.19708822331757736,
"learning_rate": 5.065446620371966e-06,
"loss": 0.10965576171875,
"memory(GiB)": 10.57,
"step": 2075,
"train_speed(iter/s)": 0.474611
},
{
"epoch": 1.5537082010087802,
"grad_norm": 0.22428614901572544,
"learning_rate": 5.061188660626064e-06,
"loss": 0.08321533203125,
"memory(GiB)": 10.57,
"step": 2080,
"train_speed(iter/s)": 0.474592
},
{
"epoch": 1.557444423687652,
"grad_norm": 0.23095421524064055,
"learning_rate": 5.056922821470048e-06,
"loss": 0.1009521484375,
"memory(GiB)": 10.57,
"step": 2085,
"train_speed(iter/s)": 0.474622
},
{
"epoch": 1.5611806463665234,
"grad_norm": 0.2871546333696532,
"learning_rate": 5.052649119211159e-06,
"loss": 0.1187744140625,
"memory(GiB)": 10.57,
"step": 2090,
"train_speed(iter/s)": 0.474601
},
{
"epoch": 1.564916869045395,
"grad_norm": 0.21613086763978323,
"learning_rate": 5.048367570186694e-06,
"loss": 0.1031494140625,
"memory(GiB)": 10.57,
"step": 2095,
"train_speed(iter/s)": 0.474627
},
{
"epoch": 1.5686530917242667,
"grad_norm": 0.2485805730125251,
"learning_rate": 5.044078190763949e-06,
"loss": 0.09178466796875,
"memory(GiB)": 10.57,
"step": 2100,
"train_speed(iter/s)": 0.474608
},
{
"epoch": 1.5723893144031384,
"grad_norm": 0.2501433468360814,
"learning_rate": 5.039780997340148e-06,
"loss": 0.096502685546875,
"memory(GiB)": 10.57,
"step": 2105,
"train_speed(iter/s)": 0.474639
},
{
"epoch": 1.5761255370820102,
"grad_norm": 0.2625314288905634,
"learning_rate": 5.035476006342392e-06,
"loss": 0.12071533203125,
"memory(GiB)": 10.57,
"step": 2110,
"train_speed(iter/s)": 0.474666
},
{
"epoch": 1.5798617597608817,
"grad_norm": 0.25070127552544946,
"learning_rate": 5.031163234227587e-06,
"loss": 0.102880859375,
"memory(GiB)": 10.57,
"step": 2115,
"train_speed(iter/s)": 0.474731
},
{
"epoch": 1.5835979824397532,
"grad_norm": 0.2730775843332172,
"learning_rate": 5.026842697482386e-06,
"loss": 0.107745361328125,
"memory(GiB)": 10.57,
"step": 2120,
"train_speed(iter/s)": 0.47469
},
{
"epoch": 1.587334205118625,
"grad_norm": 0.3168533915295129,
"learning_rate": 5.022514412623122e-06,
"loss": 0.10606689453125,
"memory(GiB)": 10.57,
"step": 2125,
"train_speed(iter/s)": 0.474712
},
{
"epoch": 1.5910704277974967,
"grad_norm": 0.26414617810461144,
"learning_rate": 5.018178396195749e-06,
"loss": 0.114739990234375,
"memory(GiB)": 10.57,
"step": 2130,
"train_speed(iter/s)": 0.474667
},
{
"epoch": 1.5948066504763685,
"grad_norm": 0.2884403060168701,
"learning_rate": 5.013834664775775e-06,
"loss": 0.09578857421875,
"memory(GiB)": 10.57,
"step": 2135,
"train_speed(iter/s)": 0.474686
},
{
"epoch": 1.59854287315524,
"grad_norm": 0.17316814005290654,
"learning_rate": 5.009483234968204e-06,
"loss": 0.09461669921875,
"memory(GiB)": 10.57,
"step": 2140,
"train_speed(iter/s)": 0.474718
},
{
"epoch": 1.6022790958341118,
"grad_norm": 0.20180870823591296,
"learning_rate": 5.005124123407466e-06,
"loss": 0.1016357421875,
"memory(GiB)": 10.57,
"step": 2145,
"train_speed(iter/s)": 0.474763
},
{
"epoch": 1.6060153185129833,
"grad_norm": 0.28225684517263877,
"learning_rate": 5.0007573467573556e-06,
"loss": 0.0999755859375,
"memory(GiB)": 10.57,
"step": 2150,
"train_speed(iter/s)": 0.474781
},
{
"epoch": 1.609751541191855,
"grad_norm": 0.11744325613491245,
"learning_rate": 4.996382921710973e-06,
"loss": 0.088720703125,
"memory(GiB)": 10.57,
"step": 2155,
"train_speed(iter/s)": 0.474755
},
{
"epoch": 1.6134877638707268,
"grad_norm": 0.34760100976149216,
"learning_rate": 4.992000864990652e-06,
"loss": 0.112939453125,
"memory(GiB)": 10.57,
"step": 2160,
"train_speed(iter/s)": 0.474772
},
{
"epoch": 1.6172239865495983,
"grad_norm": 0.22604747445071158,
"learning_rate": 4.987611193347903e-06,
"loss": 0.089892578125,
"memory(GiB)": 10.57,
"step": 2165,
"train_speed(iter/s)": 0.474717
},
{
"epoch": 1.62096020922847,
"grad_norm": 0.28280682170193416,
"learning_rate": 4.983213923563347e-06,
"loss": 0.0989990234375,
"memory(GiB)": 10.57,
"step": 2170,
"train_speed(iter/s)": 0.474738
},
{
"epoch": 1.6246964319073416,
"grad_norm": 0.22814666006274306,
"learning_rate": 4.978809072446648e-06,
"loss": 0.0938232421875,
"memory(GiB)": 10.57,
"step": 2175,
"train_speed(iter/s)": 0.474723
},
{
"epoch": 1.6284326545862133,
"grad_norm": 0.26304826342931886,
"learning_rate": 4.974396656836454e-06,
"loss": 0.09578857421875,
"memory(GiB)": 10.57,
"step": 2180,
"train_speed(iter/s)": 0.474661
},
{
"epoch": 1.632168877265085,
"grad_norm": 0.3174530542273234,
"learning_rate": 4.969976693600328e-06,
"loss": 0.08758544921875,
"memory(GiB)": 10.57,
"step": 2185,
"train_speed(iter/s)": 0.474686
},
{
"epoch": 1.6359050999439566,
"grad_norm": 0.2533342016854265,
"learning_rate": 4.965549199634688e-06,
"loss": 0.095849609375,
"memory(GiB)": 10.57,
"step": 2190,
"train_speed(iter/s)": 0.474707
},
{
"epoch": 1.6396413226228284,
"grad_norm": 0.2795419703573222,
"learning_rate": 4.96111419186474e-06,
"loss": 0.09959716796875,
"memory(GiB)": 10.57,
"step": 2195,
"train_speed(iter/s)": 0.474746
},
{
"epoch": 1.6433775453016999,
"grad_norm": 0.2244253656669392,
"learning_rate": 4.95667168724441e-06,
"loss": 0.103564453125,
"memory(GiB)": 10.57,
"step": 2200,
"train_speed(iter/s)": 0.474702
},
{
"epoch": 1.6471137679805716,
"grad_norm": 0.2568324687784542,
"learning_rate": 4.952221702756288e-06,
"loss": 0.1037445068359375,
"memory(GiB)": 10.57,
"step": 2205,
"train_speed(iter/s)": 0.474722
},
{
"epoch": 1.6508499906594434,
"grad_norm": 0.3956651516840788,
"learning_rate": 4.947764255411551e-06,
"loss": 0.11588134765625,
"memory(GiB)": 10.57,
"step": 2210,
"train_speed(iter/s)": 0.474738
},
{
"epoch": 1.6545862133383151,
"grad_norm": 0.20985100077876295,
"learning_rate": 4.943299362249912e-06,
"loss": 0.099951171875,
"memory(GiB)": 10.57,
"step": 2215,
"train_speed(iter/s)": 0.474773
},
{
"epoch": 1.6583224360171867,
"grad_norm": 0.1962140667346041,
"learning_rate": 4.9388270403395415e-06,
"loss": 0.10343017578125,
"memory(GiB)": 10.57,
"step": 2220,
"train_speed(iter/s)": 0.474776
},
{
"epoch": 1.6620586586960582,
"grad_norm": 0.22503137462618433,
"learning_rate": 4.934347306777012e-06,
"loss": 0.1007568359375,
"memory(GiB)": 10.57,
"step": 2225,
"train_speed(iter/s)": 0.474752
},
{
"epoch": 1.66579488137493,
"grad_norm": 0.22195673002837232,
"learning_rate": 4.929860178687226e-06,
"loss": 0.091131591796875,
"memory(GiB)": 10.57,
"step": 2230,
"train_speed(iter/s)": 0.474771
},
{
"epoch": 1.6695311040538017,
"grad_norm": 0.3168855098173885,
"learning_rate": 4.9253656732233564e-06,
"loss": 0.11160888671875,
"memory(GiB)": 10.57,
"step": 2235,
"train_speed(iter/s)": 0.474768
},
{
"epoch": 1.6732673267326734,
"grad_norm": 0.1738888875381385,
"learning_rate": 4.920863807566776e-06,
"loss": 0.0958465576171875,
"memory(GiB)": 10.57,
"step": 2240,
"train_speed(iter/s)": 0.474725
},
{
"epoch": 1.677003549411545,
"grad_norm": 0.2552273932950652,
"learning_rate": 4.9163545989269944e-06,
"loss": 0.09219970703125,
"memory(GiB)": 10.57,
"step": 2245,
"train_speed(iter/s)": 0.474729
},
{
"epoch": 1.6807397720904165,
"grad_norm": 0.3060989271500881,
"learning_rate": 4.9118380645415905e-06,
"loss": 0.100439453125,
"memory(GiB)": 10.57,
"step": 2250,
"train_speed(iter/s)": 0.474737
},
{
"epoch": 1.6844759947692882,
"grad_norm": 0.2949704093412238,
"learning_rate": 4.907314221676149e-06,
"loss": 0.102716064453125,
"memory(GiB)": 10.57,
"step": 2255,
"train_speed(iter/s)": 0.474753
},
{
"epoch": 1.68821221744816,
"grad_norm": 0.28246484565713104,
"learning_rate": 4.902783087624195e-06,
"loss": 0.104339599609375,
"memory(GiB)": 10.57,
"step": 2260,
"train_speed(iter/s)": 0.474772
},
{
"epoch": 1.6919484401270317,
"grad_norm": 0.2912739109964812,
"learning_rate": 4.89824467970712e-06,
"loss": 0.09698486328125,
"memory(GiB)": 10.57,
"step": 2265,
"train_speed(iter/s)": 0.474766
},
{
"epoch": 1.6956846628059032,
"grad_norm": 0.20297905907906486,
"learning_rate": 4.8936990152741276e-06,
"loss": 0.10142822265625,
"memory(GiB)": 10.57,
"step": 2270,
"train_speed(iter/s)": 0.474788
},
{
"epoch": 1.6994208854847748,
"grad_norm": 0.27675872548007086,
"learning_rate": 4.88914611170216e-06,
"loss": 0.11038818359375,
"memory(GiB)": 10.57,
"step": 2275,
"train_speed(iter/s)": 0.474807
},
{
"epoch": 1.7031571081636465,
"grad_norm": 0.26312724669069576,
"learning_rate": 4.88458598639583e-06,
"loss": 0.10172119140625,
"memory(GiB)": 10.57,
"step": 2280,
"train_speed(iter/s)": 0.474842
},
{
"epoch": 1.7068933308425183,
"grad_norm": 0.2905331610134025,
"learning_rate": 4.880018656787359e-06,
"loss": 0.09381103515625,
"memory(GiB)": 10.57,
"step": 2285,
"train_speed(iter/s)": 0.474842
},
{
"epoch": 1.71062955352139,
"grad_norm": 0.34444149002078045,
"learning_rate": 4.8754441403365105e-06,
"loss": 0.1239501953125,
"memory(GiB)": 10.57,
"step": 2290,
"train_speed(iter/s)": 0.47486
},
{
"epoch": 1.7143657762002615,
"grad_norm": 0.2738462078711773,
"learning_rate": 4.8708624545305185e-06,
"loss": 0.0885498046875,
"memory(GiB)": 10.57,
"step": 2295,
"train_speed(iter/s)": 0.474827
},
{
"epoch": 1.718101998879133,
"grad_norm": 0.28959854575833754,
"learning_rate": 4.866273616884027e-06,
"loss": 0.11025390625,
"memory(GiB)": 10.57,
"step": 2300,
"train_speed(iter/s)": 0.474849
},
{
"epoch": 1.7218382215580048,
"grad_norm": 0.20588142938995796,
"learning_rate": 4.861677644939015e-06,
"loss": 0.08424072265625,
"memory(GiB)": 10.57,
"step": 2305,
"train_speed(iter/s)": 0.474856
},
{
"epoch": 1.7255744442368766,
"grad_norm": 0.3354441601677246,
"learning_rate": 4.857074556264738e-06,
"loss": 0.1094970703125,
"memory(GiB)": 10.57,
"step": 2310,
"train_speed(iter/s)": 0.474867
},
{
"epoch": 1.7293106669157483,
"grad_norm": 0.20426806575301326,
"learning_rate": 4.852464368457656e-06,
"loss": 0.10550537109375,
"memory(GiB)": 10.57,
"step": 2315,
"train_speed(iter/s)": 0.474874
},
{
"epoch": 1.7330468895946198,
"grad_norm": 0.23904264143395532,
"learning_rate": 4.8478470991413675e-06,
"loss": 0.086602783203125,
"memory(GiB)": 10.57,
"step": 2320,
"train_speed(iter/s)": 0.474876
},
{
"epoch": 1.7367831122734914,
"grad_norm": 0.22442760094437317,
"learning_rate": 4.84322276596654e-06,
"loss": 0.10830078125,
"memory(GiB)": 10.57,
"step": 2325,
"train_speed(iter/s)": 0.4749
},
{
"epoch": 1.740519334952363,
"grad_norm": 0.22627089113762092,
"learning_rate": 4.838591386610846e-06,
"loss": 0.0934814453125,
"memory(GiB)": 10.57,
"step": 2330,
"train_speed(iter/s)": 0.474923
},
{
"epoch": 1.7442555576312349,
"grad_norm": 0.212873273345035,
"learning_rate": 4.833952978778896e-06,
"loss": 0.10042724609375,
"memory(GiB)": 10.57,
"step": 2335,
"train_speed(iter/s)": 0.474953
},
{
"epoch": 1.7479917803101066,
"grad_norm": 0.310168401865503,
"learning_rate": 4.829307560202164e-06,
"loss": 0.090283203125,
"memory(GiB)": 10.57,
"step": 2340,
"train_speed(iter/s)": 0.47497
},
{
"epoch": 1.7517280029889781,
"grad_norm": 0.25363080821630596,
"learning_rate": 4.824655148638925e-06,
"loss": 0.09075927734375,
"memory(GiB)": 10.57,
"step": 2345,
"train_speed(iter/s)": 0.474997
},
{
"epoch": 1.7554642256678497,
"grad_norm": 0.2287201903267125,
"learning_rate": 4.81999576187419e-06,
"loss": 0.122119140625,
"memory(GiB)": 10.57,
"step": 2350,
"train_speed(iter/s)": 0.474997
},
{
"epoch": 1.7592004483467214,
"grad_norm": 0.360999021305386,
"learning_rate": 4.815329417719632e-06,
"loss": 0.11300048828125,
"memory(GiB)": 10.57,
"step": 2355,
"train_speed(iter/s)": 0.474979
},
{
"epoch": 1.7629366710255931,
"grad_norm": 0.2535783044832626,
"learning_rate": 4.810656134013522e-06,
"loss": 0.108135986328125,
"memory(GiB)": 10.57,
"step": 2360,
"train_speed(iter/s)": 0.474978
},
{
"epoch": 1.766672893704465,
"grad_norm": 0.32574474831453987,
"learning_rate": 4.805975928620656e-06,
"loss": 0.10255126953125,
"memory(GiB)": 10.57,
"step": 2365,
"train_speed(iter/s)": 0.47493
},
{
"epoch": 1.7704091163833364,
"grad_norm": 0.19234656846328618,
"learning_rate": 4.801288819432292e-06,
"loss": 0.10970458984375,
"memory(GiB)": 10.57,
"step": 2370,
"train_speed(iter/s)": 0.474954
},
{
"epoch": 1.774145339062208,
"grad_norm": 0.2139672272846014,
"learning_rate": 4.79659482436608e-06,
"loss": 0.09434814453125,
"memory(GiB)": 10.57,
"step": 2375,
"train_speed(iter/s)": 0.474927
},
{
"epoch": 1.7778815617410797,
"grad_norm": 0.2978805049656468,
"learning_rate": 4.791893961365992e-06,
"loss": 0.11248779296875,
"memory(GiB)": 10.57,
"step": 2380,
"train_speed(iter/s)": 0.474937
},
{
"epoch": 1.7816177844199514,
"grad_norm": 0.20130959752649452,
"learning_rate": 4.787186248402255e-06,
"loss": 0.0978759765625,
"memory(GiB)": 10.57,
"step": 2385,
"train_speed(iter/s)": 0.474949
},
{
"epoch": 1.7853540070988232,
"grad_norm": 0.29180997165297434,
"learning_rate": 4.782471703471281e-06,
"loss": 0.112115478515625,
"memory(GiB)": 10.57,
"step": 2390,
"train_speed(iter/s)": 0.475004
},
{
"epoch": 1.7890902297776947,
"grad_norm": 0.35716522757327235,
"learning_rate": 4.777750344595599e-06,
"loss": 0.111859130859375,
"memory(GiB)": 10.57,
"step": 2395,
"train_speed(iter/s)": 0.475038
},
{
"epoch": 1.7928264524565665,
"grad_norm": 0.20213639606383335,
"learning_rate": 4.773022189823787e-06,
"loss": 0.09229736328125,
"memory(GiB)": 10.57,
"step": 2400,
"train_speed(iter/s)": 0.475057
},
{
"epoch": 1.796562675135438,
"grad_norm": 0.2865105053142085,
"learning_rate": 4.768287257230401e-06,
"loss": 0.097021484375,
"memory(GiB)": 10.57,
"step": 2405,
"train_speed(iter/s)": 0.475109
},
{
"epoch": 1.8002988978143097,
"grad_norm": 0.21308993463861362,
"learning_rate": 4.763545564915908e-06,
"loss": 0.0991943359375,
"memory(GiB)": 10.57,
"step": 2410,
"train_speed(iter/s)": 0.475081
},
{
"epoch": 1.8040351204931815,
"grad_norm": 0.23525035418815923,
"learning_rate": 4.758797131006613e-06,
"loss": 0.0963623046875,
"memory(GiB)": 10.57,
"step": 2415,
"train_speed(iter/s)": 0.475099
},
{
"epoch": 1.807771343172053,
"grad_norm": 0.21883109136220677,
"learning_rate": 4.754041973654596e-06,
"loss": 0.092449951171875,
"memory(GiB)": 10.57,
"step": 2420,
"train_speed(iter/s)": 0.475037
},
{
"epoch": 1.8115075658509248,
"grad_norm": 0.3077520982362397,
"learning_rate": 4.749280111037637e-06,
"loss": 0.113623046875,
"memory(GiB)": 10.57,
"step": 2425,
"train_speed(iter/s)": 0.475075
},
{
"epoch": 1.8152437885297963,
"grad_norm": 0.32425955991836447,
"learning_rate": 4.7445115613591496e-06,
"loss": 0.09962158203125,
"memory(GiB)": 10.57,
"step": 2430,
"train_speed(iter/s)": 0.475116
},
{
"epoch": 1.818980011208668,
"grad_norm": 0.32297534935048733,
"learning_rate": 4.739736342848108e-06,
"loss": 0.09112548828125,
"memory(GiB)": 10.57,
"step": 2435,
"train_speed(iter/s)": 0.475123
},
{
"epoch": 1.8227162338875398,
"grad_norm": 0.21046232051363747,
"learning_rate": 4.734954473758984e-06,
"loss": 0.08634033203125,
"memory(GiB)": 10.57,
"step": 2440,
"train_speed(iter/s)": 0.47511
},
{
"epoch": 1.8264524565664113,
"grad_norm": 0.1757652117500697,
"learning_rate": 4.730165972371668e-06,
"loss": 0.1082275390625,
"memory(GiB)": 10.57,
"step": 2445,
"train_speed(iter/s)": 0.475149
},
{
"epoch": 1.830188679245283,
"grad_norm": 0.25911116090794284,
"learning_rate": 4.725370856991408e-06,
"loss": 0.1029541015625,
"memory(GiB)": 10.57,
"step": 2450,
"train_speed(iter/s)": 0.475184
},
{
"epoch": 1.8339249019241546,
"grad_norm": 0.34390479485101666,
"learning_rate": 4.720569145948732e-06,
"loss": 0.11917724609375,
"memory(GiB)": 10.57,
"step": 2455,
"train_speed(iter/s)": 0.475229
},
{
"epoch": 1.8376611246030263,
"grad_norm": 0.2682881042332428,
"learning_rate": 4.715760857599386e-06,
"loss": 0.09146728515625,
"memory(GiB)": 10.57,
"step": 2460,
"train_speed(iter/s)": 0.475248
},
{
"epoch": 1.841397347281898,
"grad_norm": 0.19430110744207282,
"learning_rate": 4.710946010324257e-06,
"loss": 0.10311279296875,
"memory(GiB)": 10.57,
"step": 2465,
"train_speed(iter/s)": 0.475206
},
{
"epoch": 1.8451335699607698,
"grad_norm": 0.27883436818284973,
"learning_rate": 4.706124622529303e-06,
"loss": 0.10494384765625,
"memory(GiB)": 10.57,
"step": 2470,
"train_speed(iter/s)": 0.475183
},
{
"epoch": 1.8488697926396414,
"grad_norm": 0.31596787268028487,
"learning_rate": 4.7012967126454875e-06,
"loss": 0.08948974609375,
"memory(GiB)": 10.57,
"step": 2475,
"train_speed(iter/s)": 0.47521
},
{
"epoch": 1.8526060153185129,
"grad_norm": 0.31069646386041977,
"learning_rate": 4.696462299128708e-06,
"loss": 0.08408203125,
"memory(GiB)": 10.57,
"step": 2480,
"train_speed(iter/s)": 0.475194
},
{
"epoch": 1.8563422379973846,
"grad_norm": 0.2061030284127865,
"learning_rate": 4.691621400459718e-06,
"loss": 0.09312744140625,
"memory(GiB)": 10.57,
"step": 2485,
"train_speed(iter/s)": 0.475183
},
{
"epoch": 1.8600784606762564,
"grad_norm": 0.2927277785286754,
"learning_rate": 4.686774035144067e-06,
"loss": 0.104736328125,
"memory(GiB)": 10.57,
"step": 2490,
"train_speed(iter/s)": 0.475219
},
{
"epoch": 1.8638146833551281,
"grad_norm": 0.27419348046623093,
"learning_rate": 4.681920221712026e-06,
"loss": 0.10330810546875,
"memory(GiB)": 10.57,
"step": 2495,
"train_speed(iter/s)": 0.475193
},
{
"epoch": 1.8675509060339996,
"grad_norm": 0.2618512568544601,
"learning_rate": 4.67705997871851e-06,
"loss": 0.09486083984375,
"memory(GiB)": 10.57,
"step": 2500,
"train_speed(iter/s)": 0.475193
},
{
"epoch": 1.8712871287128712,
"grad_norm": 0.2616692317535369,
"learning_rate": 4.6721933247430155e-06,
"loss": 0.10108642578125,
"memory(GiB)": 10.57,
"step": 2505,
"train_speed(iter/s)": 0.475234
},
{
"epoch": 1.875023351391743,
"grad_norm": 0.37832147071618105,
"learning_rate": 4.667320278389548e-06,
"loss": 0.094085693359375,
"memory(GiB)": 10.57,
"step": 2510,
"train_speed(iter/s)": 0.475221
},
{
"epoch": 1.8787595740706147,
"grad_norm": 0.24687088782500174,
"learning_rate": 4.662440858286548e-06,
"loss": 0.09676513671875,
"memory(GiB)": 10.57,
"step": 2515,
"train_speed(iter/s)": 0.475216
},
{
"epoch": 1.8824957967494864,
"grad_norm": 0.234016616688346,
"learning_rate": 4.657555083086823e-06,
"loss": 0.10130615234375,
"memory(GiB)": 10.57,
"step": 2520,
"train_speed(iter/s)": 0.475251
},
{
"epoch": 1.886232019428358,
"grad_norm": 0.238817474808307,
"learning_rate": 4.65266297146747e-06,
"loss": 0.097900390625,
"memory(GiB)": 10.57,
"step": 2525,
"train_speed(iter/s)": 0.475255
},
{
"epoch": 1.8899682421072295,
"grad_norm": 0.207645191573174,
"learning_rate": 4.647764542129812e-06,
"loss": 0.091064453125,
"memory(GiB)": 10.57,
"step": 2530,
"train_speed(iter/s)": 0.475271
},
{
"epoch": 1.8937044647861012,
"grad_norm": 0.38113365892750667,
"learning_rate": 4.642859813799324e-06,
"loss": 0.118853759765625,
"memory(GiB)": 10.57,
"step": 2535,
"train_speed(iter/s)": 0.475293
},
{
"epoch": 1.897440687464973,
"grad_norm": 0.19816679538437149,
"learning_rate": 4.637948805225559e-06,
"loss": 0.08568115234375,
"memory(GiB)": 10.57,
"step": 2540,
"train_speed(iter/s)": 0.475228
},
{
"epoch": 1.9011769101438447,
"grad_norm": 0.23604249041392467,
"learning_rate": 4.633031535182075e-06,
"loss": 0.11710205078125,
"memory(GiB)": 10.57,
"step": 2545,
"train_speed(iter/s)": 0.47526
},
{
"epoch": 1.9049131328227162,
"grad_norm": 0.24670385102759632,
"learning_rate": 4.6281080224663716e-06,
"loss": 0.087890625,
"memory(GiB)": 10.57,
"step": 2550,
"train_speed(iter/s)": 0.475273
},
{
"epoch": 1.9086493555015878,
"grad_norm": 0.2847144171201072,
"learning_rate": 4.62317828589981e-06,
"loss": 0.104248046875,
"memory(GiB)": 10.57,
"step": 2555,
"train_speed(iter/s)": 0.475257
},
{
"epoch": 1.9123855781804595,
"grad_norm": 0.3178684000074,
"learning_rate": 4.618242344327542e-06,
"loss": 0.0997802734375,
"memory(GiB)": 10.57,
"step": 2560,
"train_speed(iter/s)": 0.475268
},
{
"epoch": 1.9161218008593313,
"grad_norm": 0.2554865843964831,
"learning_rate": 4.613300216618441e-06,
"loss": 0.097015380859375,
"memory(GiB)": 10.57,
"step": 2565,
"train_speed(iter/s)": 0.475299
},
{
"epoch": 1.919858023538203,
"grad_norm": 0.2965767135219661,
"learning_rate": 4.608351921665029e-06,
"loss": 0.10614013671875,
"memory(GiB)": 10.57,
"step": 2570,
"train_speed(iter/s)": 0.475332
},
{
"epoch": 1.9235942462170745,
"grad_norm": 0.4039822442089598,
"learning_rate": 4.603397478383403e-06,
"loss": 0.10904541015625,
"memory(GiB)": 10.57,
"step": 2575,
"train_speed(iter/s)": 0.475287
},
{
"epoch": 1.927330468895946,
"grad_norm": 0.25628472854278145,
"learning_rate": 4.5984369057131656e-06,
"loss": 0.0983642578125,
"memory(GiB)": 10.57,
"step": 2580,
"train_speed(iter/s)": 0.475305
},
{
"epoch": 1.9310666915748178,
"grad_norm": 0.2779068338896975,
"learning_rate": 4.5934702226173455e-06,
"loss": 0.098095703125,
"memory(GiB)": 10.57,
"step": 2585,
"train_speed(iter/s)": 0.475271
},
{
"epoch": 1.9348029142536896,
"grad_norm": 0.281249239607163,
"learning_rate": 4.588497448082336e-06,
"loss": 0.129345703125,
"memory(GiB)": 10.57,
"step": 2590,
"train_speed(iter/s)": 0.475263
},
{
"epoch": 1.9385391369325613,
"grad_norm": 0.18136865279150907,
"learning_rate": 4.583518601117812e-06,
"loss": 0.089013671875,
"memory(GiB)": 10.57,
"step": 2595,
"train_speed(iter/s)": 0.47529
},
{
"epoch": 1.9422753596114328,
"grad_norm": 0.3240659543460739,
"learning_rate": 4.578533700756666e-06,
"loss": 0.11053466796875,
"memory(GiB)": 10.57,
"step": 2600,
"train_speed(iter/s)": 0.475327
},
{
"epoch": 1.9460115822903044,
"grad_norm": 0.19903277137682823,
"learning_rate": 4.573542766054926e-06,
"loss": 0.1120361328125,
"memory(GiB)": 10.57,
"step": 2605,
"train_speed(iter/s)": 0.475344
},
{
"epoch": 1.949747804969176,
"grad_norm": 0.24138123028972722,
"learning_rate": 4.568545816091691e-06,
"loss": 0.08602294921875,
"memory(GiB)": 10.57,
"step": 2610,
"train_speed(iter/s)": 0.475337
},
{
"epoch": 1.9534840276480478,
"grad_norm": 0.28322280343269146,
"learning_rate": 4.563542869969055e-06,
"loss": 0.08720703125,
"memory(GiB)": 10.57,
"step": 2615,
"train_speed(iter/s)": 0.475316
},
{
"epoch": 1.9572202503269196,
"grad_norm": 0.249240836739657,
"learning_rate": 4.558533946812034e-06,
"loss": 0.093548583984375,
"memory(GiB)": 10.57,
"step": 2620,
"train_speed(iter/s)": 0.475334
},
{
"epoch": 1.9609564730057911,
"grad_norm": 0.26762802652785167,
"learning_rate": 4.55351906576849e-06,
"loss": 0.08345947265625,
"memory(GiB)": 10.57,
"step": 2625,
"train_speed(iter/s)": 0.475361
},
{
"epoch": 1.9646926956846626,
"grad_norm": 0.22273584638151617,
"learning_rate": 4.548498246009062e-06,
"loss": 0.10457763671875,
"memory(GiB)": 10.57,
"step": 2630,
"train_speed(iter/s)": 0.475364
},
{
"epoch": 1.9684289183635344,
"grad_norm": 0.3304879364377937,
"learning_rate": 4.543471506727094e-06,
"loss": 0.1021240234375,
"memory(GiB)": 10.57,
"step": 2635,
"train_speed(iter/s)": 0.475354
},
{
"epoch": 1.9721651410424061,
"grad_norm": 0.29863906262334294,
"learning_rate": 4.538438867138554e-06,
"loss": 0.10843505859375,
"memory(GiB)": 10.57,
"step": 2640,
"train_speed(iter/s)": 0.475332
},
{
"epoch": 1.975901363721278,
"grad_norm": 0.2714963446386557,
"learning_rate": 4.533400346481969e-06,
"loss": 0.097955322265625,
"memory(GiB)": 10.57,
"step": 2645,
"train_speed(iter/s)": 0.475329
},
{
"epoch": 1.9796375864001494,
"grad_norm": 0.3336618360843215,
"learning_rate": 4.528355964018347e-06,
"loss": 0.09144287109375,
"memory(GiB)": 10.57,
"step": 2650,
"train_speed(iter/s)": 0.475305
},
{
"epoch": 1.983373809079021,
"grad_norm": 0.2980584550792422,
"learning_rate": 4.523305739031104e-06,
"loss": 0.0895965576171875,
"memory(GiB)": 10.57,
"step": 2655,
"train_speed(iter/s)": 0.475329
},
{
"epoch": 1.9871100317578927,
"grad_norm": 0.2720629310615164,
"learning_rate": 4.518249690825988e-06,
"loss": 0.1112548828125,
"memory(GiB)": 10.57,
"step": 2660,
"train_speed(iter/s)": 0.475346
},
{
"epoch": 1.9908462544367644,
"grad_norm": 0.3546253789825318,
"learning_rate": 4.5131878387310135e-06,
"loss": 0.12337646484375,
"memory(GiB)": 10.57,
"step": 2665,
"train_speed(iter/s)": 0.475357
},
{
"epoch": 1.9945824771156362,
"grad_norm": 0.28424801518849385,
"learning_rate": 4.508120202096376e-06,
"loss": 0.109814453125,
"memory(GiB)": 10.57,
"step": 2670,
"train_speed(iter/s)": 0.475325
},
{
"epoch": 1.9983186997945077,
"grad_norm": 0.21729292843099146,
"learning_rate": 4.5030468002943874e-06,
"loss": 0.0903076171875,
"memory(GiB)": 10.57,
"step": 2675,
"train_speed(iter/s)": 0.475307
},
{
"epoch": 2.0014944890715487,
"grad_norm": 0.2345088903539215,
"learning_rate": 4.497967652719397e-06,
"loss": 0.08399658203125,
"memory(GiB)": 10.57,
"step": 2680,
"train_speed(iter/s)": 0.475181
},
{
"epoch": 2.0052307117504204,
"grad_norm": 0.15184847590072537,
"learning_rate": 4.492882778787718e-06,
"loss": 0.07313232421875,
"memory(GiB)": 10.57,
"step": 2685,
"train_speed(iter/s)": 0.475183
},
{
"epoch": 2.008966934429292,
"grad_norm": 0.19979306629529392,
"learning_rate": 4.487792197937558e-06,
"loss": 0.0822509765625,
"memory(GiB)": 10.57,
"step": 2690,
"train_speed(iter/s)": 0.475199
},
{
"epoch": 2.0127031571081635,
"grad_norm": 0.2757404700372733,
"learning_rate": 4.482695929628936e-06,
"loss": 0.083453369140625,
"memory(GiB)": 10.57,
"step": 2695,
"train_speed(iter/s)": 0.475215
},
{
"epoch": 2.0164393797870352,
"grad_norm": 0.2560396040817178,
"learning_rate": 4.477593993343614e-06,
"loss": 0.0873291015625,
"memory(GiB)": 10.57,
"step": 2700,
"train_speed(iter/s)": 0.475205
},
{
"epoch": 2.020175602465907,
"grad_norm": 0.26086772363802274,
"learning_rate": 4.472486408585022e-06,
"loss": 0.084521484375,
"memory(GiB)": 10.57,
"step": 2705,
"train_speed(iter/s)": 0.475236
},
{
"epoch": 2.0239118251447787,
"grad_norm": 0.2694766103158065,
"learning_rate": 4.467373194878183e-06,
"loss": 0.0845458984375,
"memory(GiB)": 10.57,
"step": 2710,
"train_speed(iter/s)": 0.475254
},
{
"epoch": 2.0276480478236505,
"grad_norm": 0.36339657819849375,
"learning_rate": 4.462254371769637e-06,
"loss": 0.08817138671875,
"memory(GiB)": 10.57,
"step": 2715,
"train_speed(iter/s)": 0.475273
},
{
"epoch": 2.031384270502522,
"grad_norm": 0.1574529728668933,
"learning_rate": 4.457129958827369e-06,
"loss": 0.07781982421875,
"memory(GiB)": 10.57,
"step": 2720,
"train_speed(iter/s)": 0.475304
},
{
"epoch": 2.0351204931813935,
"grad_norm": 0.2776966602079697,
"learning_rate": 4.451999975640731e-06,
"loss": 0.079388427734375,
"memory(GiB)": 10.57,
"step": 2725,
"train_speed(iter/s)": 0.47531
},
{
"epoch": 2.0388567158602653,
"grad_norm": 0.1584963725015156,
"learning_rate": 4.446864441820368e-06,
"loss": 0.09000244140625,
"memory(GiB)": 10.57,
"step": 2730,
"train_speed(iter/s)": 0.475323
},
{
"epoch": 2.042592938539137,
"grad_norm": 0.22327421262837086,
"learning_rate": 4.441723376998147e-06,
"loss": 0.0762939453125,
"memory(GiB)": 10.57,
"step": 2735,
"train_speed(iter/s)": 0.475273
},
{
"epoch": 2.046329161218009,
"grad_norm": 0.33057687684074827,
"learning_rate": 4.436576800827074e-06,
"loss": 0.06875762939453126,
"memory(GiB)": 10.57,
"step": 2740,
"train_speed(iter/s)": 0.475284
},
{
"epoch": 2.05006538389688,
"grad_norm": 0.29923227392853685,
"learning_rate": 4.431424732981228e-06,
"loss": 0.06706466674804687,
"memory(GiB)": 10.57,
"step": 2745,
"train_speed(iter/s)": 0.475292
},
{
"epoch": 2.053801606575752,
"grad_norm": 0.4030927309740962,
"learning_rate": 4.426267193155678e-06,
"loss": 0.075927734375,
"memory(GiB)": 10.57,
"step": 2750,
"train_speed(iter/s)": 0.475316
},
{
"epoch": 2.0575378292546236,
"grad_norm": 0.37117244198948085,
"learning_rate": 4.4211042010664135e-06,
"loss": 0.07960205078125,
"memory(GiB)": 10.57,
"step": 2755,
"train_speed(iter/s)": 0.475314
},
{
"epoch": 2.0612740519334953,
"grad_norm": 0.31391095462008983,
"learning_rate": 4.415935776450264e-06,
"loss": 0.09554443359375,
"memory(GiB)": 10.57,
"step": 2760,
"train_speed(iter/s)": 0.475317
},
{
"epoch": 2.065010274612367,
"grad_norm": 0.17975702587106152,
"learning_rate": 4.410761939064827e-06,
"loss": 0.07388916015625,
"memory(GiB)": 10.57,
"step": 2765,
"train_speed(iter/s)": 0.475337
},
{
"epoch": 2.0687464972912384,
"grad_norm": 0.3396889402601098,
"learning_rate": 4.405582708688395e-06,
"loss": 0.084979248046875,
"memory(GiB)": 10.57,
"step": 2770,
"train_speed(iter/s)": 0.475352
},
{
"epoch": 2.07248271997011,
"grad_norm": 0.24563175886180283,
"learning_rate": 4.400398105119872e-06,
"loss": 0.08388671875,
"memory(GiB)": 10.57,
"step": 2775,
"train_speed(iter/s)": 0.475388
},
{
"epoch": 2.076218942648982,
"grad_norm": 0.2558763668832394,
"learning_rate": 4.395208148178704e-06,
"loss": 0.0897216796875,
"memory(GiB)": 10.57,
"step": 2780,
"train_speed(iter/s)": 0.475404
},
{
"epoch": 2.0799551653278536,
"grad_norm": 0.3548268406619161,
"learning_rate": 4.390012857704802e-06,
"loss": 0.08565673828125,
"memory(GiB)": 10.57,
"step": 2785,
"train_speed(iter/s)": 0.4754
},
{
"epoch": 2.0836913880067254,
"grad_norm": 0.326064743718348,
"learning_rate": 4.384812253558467e-06,
"loss": 0.08856201171875,
"memory(GiB)": 10.57,
"step": 2790,
"train_speed(iter/s)": 0.47541
},
{
"epoch": 2.0874276106855967,
"grad_norm": 0.3250783826701612,
"learning_rate": 4.37960635562031e-06,
"loss": 0.083563232421875,
"memory(GiB)": 10.57,
"step": 2795,
"train_speed(iter/s)": 0.475407
},
{
"epoch": 2.0911638333644684,
"grad_norm": 0.1928343549830928,
"learning_rate": 4.3743951837911804e-06,
"loss": 0.0770751953125,
"memory(GiB)": 10.57,
"step": 2800,
"train_speed(iter/s)": 0.475418
},
{
"epoch": 2.09490005604334,
"grad_norm": 0.3314940438350291,
"learning_rate": 4.3691787579920886e-06,
"loss": 0.0668182373046875,
"memory(GiB)": 10.57,
"step": 2805,
"train_speed(iter/s)": 0.475443
},
{
"epoch": 2.098636278722212,
"grad_norm": 0.25557946764887945,
"learning_rate": 4.363957098164129e-06,
"loss": 0.09249267578125,
"memory(GiB)": 10.57,
"step": 2810,
"train_speed(iter/s)": 0.475472
},
{
"epoch": 2.1023725014010837,
"grad_norm": 0.2834236723948582,
"learning_rate": 4.358730224268404e-06,
"loss": 0.076348876953125,
"memory(GiB)": 10.57,
"step": 2815,
"train_speed(iter/s)": 0.475457
},
{
"epoch": 2.106108724079955,
"grad_norm": 0.17913726646319922,
"learning_rate": 4.353498156285951e-06,
"loss": 0.0684478759765625,
"memory(GiB)": 10.57,
"step": 2820,
"train_speed(iter/s)": 0.475474
},
{
"epoch": 2.1098449467588267,
"grad_norm": 0.39181628904806004,
"learning_rate": 4.3482609142176585e-06,
"loss": 0.08323974609375,
"memory(GiB)": 10.57,
"step": 2825,
"train_speed(iter/s)": 0.475472
},
{
"epoch": 2.1135811694376985,
"grad_norm": 0.3689042584118628,
"learning_rate": 4.343018518084197e-06,
"loss": 0.08089599609375,
"memory(GiB)": 10.57,
"step": 2830,
"train_speed(iter/s)": 0.475507
},
{
"epoch": 2.11731739211657,
"grad_norm": 0.30944635697905426,
"learning_rate": 4.337770987925941e-06,
"loss": 0.074566650390625,
"memory(GiB)": 10.57,
"step": 2835,
"train_speed(iter/s)": 0.475485
},
{
"epoch": 2.121053614795442,
"grad_norm": 0.20965343005966453,
"learning_rate": 4.332518343802886e-06,
"loss": 0.0746063232421875,
"memory(GiB)": 10.57,
"step": 2840,
"train_speed(iter/s)": 0.475453
},
{
"epoch": 2.1247898374743133,
"grad_norm": 0.24055286896299563,
"learning_rate": 4.327260605794583e-06,
"loss": 0.0832275390625,
"memory(GiB)": 10.57,
"step": 2845,
"train_speed(iter/s)": 0.475488
},
{
"epoch": 2.128526060153185,
"grad_norm": 0.30278392143378924,
"learning_rate": 4.321997794000053e-06,
"loss": 0.09150390625,
"memory(GiB)": 10.57,
"step": 2850,
"train_speed(iter/s)": 0.475506
},
{
"epoch": 2.1322622828320568,
"grad_norm": 0.3357493665071166,
"learning_rate": 4.316729928537712e-06,
"loss": 0.077264404296875,
"memory(GiB)": 10.57,
"step": 2855,
"train_speed(iter/s)": 0.475505
},
{
"epoch": 2.1359985055109285,
"grad_norm": 0.28839246476160085,
"learning_rate": 4.311457029545295e-06,
"loss": 0.07557373046875,
"memory(GiB)": 10.57,
"step": 2860,
"train_speed(iter/s)": 0.475494
},
{
"epoch": 2.1397347281898003,
"grad_norm": 0.3587645451871882,
"learning_rate": 4.30617911717978e-06,
"loss": 0.08240966796875,
"memory(GiB)": 10.57,
"step": 2865,
"train_speed(iter/s)": 0.475522
},
{
"epoch": 2.1434709508686716,
"grad_norm": 0.21348435074986552,
"learning_rate": 4.3008962116173105e-06,
"loss": 0.06397705078125,
"memory(GiB)": 10.57,
"step": 2870,
"train_speed(iter/s)": 0.47546
},
{
"epoch": 2.1472071735475433,
"grad_norm": 0.24044644726569717,
"learning_rate": 4.295608333053115e-06,
"loss": 0.0892333984375,
"memory(GiB)": 10.57,
"step": 2875,
"train_speed(iter/s)": 0.475493
},
{
"epoch": 2.150943396226415,
"grad_norm": 0.271844882428932,
"learning_rate": 4.290315501701436e-06,
"loss": 0.07017822265625,
"memory(GiB)": 10.57,
"step": 2880,
"train_speed(iter/s)": 0.475506
},
{
"epoch": 2.154679618905287,
"grad_norm": 0.32275562789715756,
"learning_rate": 4.285017737795447e-06,
"loss": 0.094970703125,
"memory(GiB)": 10.57,
"step": 2885,
"train_speed(iter/s)": 0.475543
},
{
"epoch": 2.1584158415841586,
"grad_norm": 0.19204227011392838,
"learning_rate": 4.279715061587176e-06,
"loss": 0.082275390625,
"memory(GiB)": 10.57,
"step": 2890,
"train_speed(iter/s)": 0.475515
},
{
"epoch": 2.1621520642630303,
"grad_norm": 0.3187374981569435,
"learning_rate": 4.274407493347435e-06,
"loss": 0.073956298828125,
"memory(GiB)": 10.57,
"step": 2895,
"train_speed(iter/s)": 0.475498
},
{
"epoch": 2.1658882869419016,
"grad_norm": 0.31518550432451825,
"learning_rate": 4.26909505336573e-06,
"loss": 0.08779296875,
"memory(GiB)": 10.57,
"step": 2900,
"train_speed(iter/s)": 0.475501
},
{
"epoch": 2.1696245096207734,
"grad_norm": 0.25742777623976215,
"learning_rate": 4.2637777619501955e-06,
"loss": 0.068133544921875,
"memory(GiB)": 10.57,
"step": 2905,
"train_speed(iter/s)": 0.475526
},
{
"epoch": 2.173360732299645,
"grad_norm": 0.327461904975564,
"learning_rate": 4.258455639427512e-06,
"loss": 0.07855224609375,
"memory(GiB)": 10.57,
"step": 2910,
"train_speed(iter/s)": 0.475516
},
{
"epoch": 2.177096954978517,
"grad_norm": 0.2947045587842032,
"learning_rate": 4.253128706142823e-06,
"loss": 0.078759765625,
"memory(GiB)": 10.57,
"step": 2915,
"train_speed(iter/s)": 0.475556
},
{
"epoch": 2.180833177657388,
"grad_norm": 0.24106474434323896,
"learning_rate": 4.2477969824596675e-06,
"loss": 0.0806396484375,
"memory(GiB)": 10.57,
"step": 2920,
"train_speed(iter/s)": 0.475576
},
{
"epoch": 2.18456940033626,
"grad_norm": 0.35498053988232225,
"learning_rate": 4.2424604887598956e-06,
"loss": 0.08232421875,
"memory(GiB)": 10.57,
"step": 2925,
"train_speed(iter/s)": 0.475536
},
{
"epoch": 2.1883056230151317,
"grad_norm": 0.30444021185040904,
"learning_rate": 4.237119245443591e-06,
"loss": 0.08363037109375,
"memory(GiB)": 10.57,
"step": 2930,
"train_speed(iter/s)": 0.475537
},
{
"epoch": 2.1920418456940034,
"grad_norm": 0.2844894921351017,
"learning_rate": 4.231773272928995e-06,
"loss": 0.0828857421875,
"memory(GiB)": 10.57,
"step": 2935,
"train_speed(iter/s)": 0.475519
},
{
"epoch": 2.195778068372875,
"grad_norm": 0.3680515586014792,
"learning_rate": 4.226422591652426e-06,
"loss": 0.0849609375,
"memory(GiB)": 10.57,
"step": 2940,
"train_speed(iter/s)": 0.475527
},
{
"epoch": 2.199514291051747,
"grad_norm": 0.3347584264458827,
"learning_rate": 4.221067222068204e-06,
"loss": 0.07615966796875,
"memory(GiB)": 10.57,
"step": 2945,
"train_speed(iter/s)": 0.475506
},
{
"epoch": 2.203250513730618,
"grad_norm": 0.24357214909557126,
"learning_rate": 4.215707184648571e-06,
"loss": 0.071929931640625,
"memory(GiB)": 10.57,
"step": 2950,
"train_speed(iter/s)": 0.475535
},
{
"epoch": 2.20698673640949,
"grad_norm": 0.2969870033632324,
"learning_rate": 4.2103424998836166e-06,
"loss": 0.0795166015625,
"memory(GiB)": 10.57,
"step": 2955,
"train_speed(iter/s)": 0.475539
},
{
"epoch": 2.2107229590883617,
"grad_norm": 0.2597821857641748,
"learning_rate": 4.204973188281187e-06,
"loss": 0.078076171875,
"memory(GiB)": 10.57,
"step": 2960,
"train_speed(iter/s)": 0.475554
},
{
"epoch": 2.2144591817672334,
"grad_norm": 0.345560787249567,
"learning_rate": 4.199599270366825e-06,
"loss": 0.085748291015625,
"memory(GiB)": 10.57,
"step": 2965,
"train_speed(iter/s)": 0.47555
},
{
"epoch": 2.218195404446105,
"grad_norm": 0.30970032428526245,
"learning_rate": 4.1942207666836765e-06,
"loss": 0.082818603515625,
"memory(GiB)": 10.57,
"step": 2970,
"train_speed(iter/s)": 0.475506
},
{
"epoch": 2.2219316271249765,
"grad_norm": 0.3183590391694136,
"learning_rate": 4.188837697792421e-06,
"loss": 0.0791748046875,
"memory(GiB)": 10.57,
"step": 2975,
"train_speed(iter/s)": 0.475502
},
{
"epoch": 2.2256678498038482,
"grad_norm": 0.40743149107649224,
"learning_rate": 4.183450084271186e-06,
"loss": 0.085736083984375,
"memory(GiB)": 10.57,
"step": 2980,
"train_speed(iter/s)": 0.475528
},
{
"epoch": 2.22940407248272,
"grad_norm": 0.36574069885687205,
"learning_rate": 4.178057946715476e-06,
"loss": 0.08839111328125,
"memory(GiB)": 10.57,
"step": 2985,
"train_speed(iter/s)": 0.475523
},
{
"epoch": 2.2331402951615917,
"grad_norm": 0.29949255358893273,
"learning_rate": 4.172661305738086e-06,
"loss": 0.076226806640625,
"memory(GiB)": 10.57,
"step": 2990,
"train_speed(iter/s)": 0.4755
},
{
"epoch": 2.2368765178404635,
"grad_norm": 0.2645783347146312,
"learning_rate": 4.167260181969031e-06,
"loss": 0.0787109375,
"memory(GiB)": 10.57,
"step": 2995,
"train_speed(iter/s)": 0.475505
},
{
"epoch": 2.240612740519335,
"grad_norm": 0.35590583986728974,
"learning_rate": 4.161854596055458e-06,
"loss": 0.082958984375,
"memory(GiB)": 10.57,
"step": 3000,
"train_speed(iter/s)": 0.475522
},
{
"epoch": 2.2443489631982065,
"grad_norm": 0.2855462271704881,
"learning_rate": 4.156444568661574e-06,
"loss": 0.0782135009765625,
"memory(GiB)": 10.57,
"step": 3005,
"train_speed(iter/s)": 0.475556
},
{
"epoch": 2.2480851858770783,
"grad_norm": 0.23189643301309532,
"learning_rate": 4.151030120468563e-06,
"loss": 0.08284912109375,
"memory(GiB)": 10.57,
"step": 3010,
"train_speed(iter/s)": 0.475525
},
{
"epoch": 2.25182140855595,
"grad_norm": 0.2823549603550444,
"learning_rate": 4.145611272174513e-06,
"loss": 0.1001220703125,
"memory(GiB)": 10.57,
"step": 3015,
"train_speed(iter/s)": 0.47551
},
{
"epoch": 2.255557631234822,
"grad_norm": 0.3123102072825862,
"learning_rate": 4.140188044494328e-06,
"loss": 0.0789306640625,
"memory(GiB)": 10.57,
"step": 3020,
"train_speed(iter/s)": 0.475473
},
{
"epoch": 2.259293853913693,
"grad_norm": 0.34390166190396304,
"learning_rate": 4.134760458159652e-06,
"loss": 0.088250732421875,
"memory(GiB)": 10.57,
"step": 3025,
"train_speed(iter/s)": 0.475474
},
{
"epoch": 2.263030076592565,
"grad_norm": 0.4471657878322189,
"learning_rate": 4.1293285339187975e-06,
"loss": 0.08520050048828125,
"memory(GiB)": 10.57,
"step": 3030,
"train_speed(iter/s)": 0.475502
},
{
"epoch": 2.2667662992714366,
"grad_norm": 0.29627009892222517,
"learning_rate": 4.123892292536655e-06,
"loss": 0.0954498291015625,
"memory(GiB)": 10.57,
"step": 3035,
"train_speed(iter/s)": 0.475527
},
{
"epoch": 2.2705025219503083,
"grad_norm": 0.2103142983370086,
"learning_rate": 4.118451754794616e-06,
"loss": 0.079296875,
"memory(GiB)": 10.57,
"step": 3040,
"train_speed(iter/s)": 0.47555
},
{
"epoch": 2.27423874462918,
"grad_norm": 0.29094874204231086,
"learning_rate": 4.113006941490504e-06,
"loss": 0.07890625,
"memory(GiB)": 10.57,
"step": 3045,
"train_speed(iter/s)": 0.475543
},
{
"epoch": 2.2779749673080514,
"grad_norm": 0.2944500502582637,
"learning_rate": 4.1075578734384796e-06,
"loss": 0.07510986328125,
"memory(GiB)": 10.57,
"step": 3050,
"train_speed(iter/s)": 0.475485
},
{
"epoch": 2.281711189986923,
"grad_norm": 0.247526345569416,
"learning_rate": 4.1021045714689715e-06,
"loss": 0.062725830078125,
"memory(GiB)": 10.57,
"step": 3055,
"train_speed(iter/s)": 0.4755
},
{
"epoch": 2.285447412665795,
"grad_norm": 0.2223509166017715,
"learning_rate": 4.096647056428591e-06,
"loss": 0.08511962890625,
"memory(GiB)": 10.57,
"step": 3060,
"train_speed(iter/s)": 0.475511
},
{
"epoch": 2.2891836353446666,
"grad_norm": 0.40394852915768165,
"learning_rate": 4.0911853491800606e-06,
"loss": 0.078338623046875,
"memory(GiB)": 10.57,
"step": 3065,
"train_speed(iter/s)": 0.475523
},
{
"epoch": 2.2929198580235384,
"grad_norm": 0.3262435355040092,
"learning_rate": 4.085719470602121e-06,
"loss": 0.085260009765625,
"memory(GiB)": 10.57,
"step": 3070,
"train_speed(iter/s)": 0.475506
},
{
"epoch": 2.2966560807024097,
"grad_norm": 0.30731468388186667,
"learning_rate": 4.080249441589465e-06,
"loss": 0.081439208984375,
"memory(GiB)": 10.57,
"step": 3075,
"train_speed(iter/s)": 0.47553
},
{
"epoch": 2.3003923033812814,
"grad_norm": 0.2619319232654712,
"learning_rate": 4.074775283052647e-06,
"loss": 0.07823486328125,
"memory(GiB)": 10.57,
"step": 3080,
"train_speed(iter/s)": 0.475536
},
{
"epoch": 2.304128526060153,
"grad_norm": 0.28997697963247854,
"learning_rate": 4.069297015918012e-06,
"loss": 0.080047607421875,
"memory(GiB)": 10.57,
"step": 3085,
"train_speed(iter/s)": 0.475543
},
{
"epoch": 2.307864748739025,
"grad_norm": 0.3041055152853103,
"learning_rate": 4.063814661127607e-06,
"loss": 0.085015869140625,
"memory(GiB)": 10.57,
"step": 3090,
"train_speed(iter/s)": 0.475538
},
{
"epoch": 2.3116009714178967,
"grad_norm": 0.28074738714998865,
"learning_rate": 4.058328239639108e-06,
"loss": 0.0771240234375,
"memory(GiB)": 10.57,
"step": 3095,
"train_speed(iter/s)": 0.475537
},
{
"epoch": 2.3153371940967684,
"grad_norm": 0.2742208472612064,
"learning_rate": 4.052837772425735e-06,
"loss": 0.071533203125,
"memory(GiB)": 10.57,
"step": 3100,
"train_speed(iter/s)": 0.475526
},
{
"epoch": 2.3190734167756397,
"grad_norm": 0.2738394747920133,
"learning_rate": 4.0473432804761745e-06,
"loss": 0.074151611328125,
"memory(GiB)": 10.57,
"step": 3105,
"train_speed(iter/s)": 0.475533
},
{
"epoch": 2.3228096394545115,
"grad_norm": 0.3325363093754662,
"learning_rate": 4.0418447847945e-06,
"loss": 0.07762451171875,
"memory(GiB)": 10.57,
"step": 3110,
"train_speed(iter/s)": 0.475573
},
{
"epoch": 2.326545862133383,
"grad_norm": 0.29208910041820724,
"learning_rate": 4.036342306400087e-06,
"loss": 0.08729248046875,
"memory(GiB)": 10.57,
"step": 3115,
"train_speed(iter/s)": 0.475557
},
{
"epoch": 2.330282084812255,
"grad_norm": 0.2986291580987787,
"learning_rate": 4.03083586632754e-06,
"loss": 0.070965576171875,
"memory(GiB)": 10.57,
"step": 3120,
"train_speed(iter/s)": 0.475591
},
{
"epoch": 2.3340183074911263,
"grad_norm": 0.2715172245264193,
"learning_rate": 4.025325485626604e-06,
"loss": 0.07711181640625,
"memory(GiB)": 10.57,
"step": 3125,
"train_speed(iter/s)": 0.475607
},
{
"epoch": 2.337754530169998,
"grad_norm": 0.28383527690267557,
"learning_rate": 4.01981118536209e-06,
"loss": 0.073974609375,
"memory(GiB)": 10.57,
"step": 3130,
"train_speed(iter/s)": 0.475608
},
{
"epoch": 2.3414907528488698,
"grad_norm": 0.4294056030563819,
"learning_rate": 4.014292986613795e-06,
"loss": 0.09591064453125,
"memory(GiB)": 10.57,
"step": 3135,
"train_speed(iter/s)": 0.475616
},
{
"epoch": 2.3452269755277415,
"grad_norm": 0.324672085272647,
"learning_rate": 4.008770910476415e-06,
"loss": 0.073956298828125,
"memory(GiB)": 10.57,
"step": 3140,
"train_speed(iter/s)": 0.475642
},
{
"epoch": 2.3489631982066133,
"grad_norm": 0.33039068217728207,
"learning_rate": 4.003244978059466e-06,
"loss": 0.082257080078125,
"memory(GiB)": 10.57,
"step": 3145,
"train_speed(iter/s)": 0.475644
},
{
"epoch": 2.352699420885485,
"grad_norm": 0.25727097167399077,
"learning_rate": 3.997715210487215e-06,
"loss": 0.078131103515625,
"memory(GiB)": 10.57,
"step": 3150,
"train_speed(iter/s)": 0.475682
},
{
"epoch": 2.3564356435643563,
"grad_norm": 0.3005461408551253,
"learning_rate": 3.992181628898582e-06,
"loss": 0.0718292236328125,
"memory(GiB)": 10.57,
"step": 3155,
"train_speed(iter/s)": 0.475677
},
{
"epoch": 2.360171866243228,
"grad_norm": 0.21717097651290396,
"learning_rate": 3.986644254447067e-06,
"loss": 0.084930419921875,
"memory(GiB)": 10.57,
"step": 3160,
"train_speed(iter/s)": 0.475668
},
{
"epoch": 2.3639080889221,
"grad_norm": 0.2740183483391346,
"learning_rate": 3.981103108300674e-06,
"loss": 0.08662109375,
"memory(GiB)": 10.57,
"step": 3165,
"train_speed(iter/s)": 0.475671
},
{
"epoch": 2.3676443116009716,
"grad_norm": 0.23952800833281973,
"learning_rate": 3.975558211641822e-06,
"loss": 0.085614013671875,
"memory(GiB)": 10.57,
"step": 3170,
"train_speed(iter/s)": 0.475681
},
{
"epoch": 2.371380534279843,
"grad_norm": 0.20740773834062282,
"learning_rate": 3.970009585667267e-06,
"loss": 0.0666015625,
"memory(GiB)": 10.57,
"step": 3175,
"train_speed(iter/s)": 0.475702
},
{
"epoch": 2.3751167569587146,
"grad_norm": 0.3093587039146876,
"learning_rate": 3.964457251588023e-06,
"loss": 0.07269287109375,
"memory(GiB)": 10.57,
"step": 3180,
"train_speed(iter/s)": 0.475703
},
{
"epoch": 2.3788529796375864,
"grad_norm": 0.3535470284455733,
"learning_rate": 3.958901230629277e-06,
"loss": 0.0844482421875,
"memory(GiB)": 10.57,
"step": 3185,
"train_speed(iter/s)": 0.475708
},
{
"epoch": 2.382589202316458,
"grad_norm": 0.3279555931100402,
"learning_rate": 3.953341544030311e-06,
"loss": 0.08740234375,
"memory(GiB)": 10.57,
"step": 3190,
"train_speed(iter/s)": 0.475712
},
{
"epoch": 2.38632542499533,
"grad_norm": 0.37799827875806785,
"learning_rate": 3.947778213044423e-06,
"loss": 0.06464996337890624,
"memory(GiB)": 10.57,
"step": 3195,
"train_speed(iter/s)": 0.475685
},
{
"epoch": 2.3900616476742016,
"grad_norm": 0.21175755993638834,
"learning_rate": 3.942211258938837e-06,
"loss": 0.079998779296875,
"memory(GiB)": 10.57,
"step": 3200,
"train_speed(iter/s)": 0.475655
},
{
"epoch": 2.393797870353073,
"grad_norm": 0.3983514672863944,
"learning_rate": 3.936640702994629e-06,
"loss": 0.07978515625,
"memory(GiB)": 10.57,
"step": 3205,
"train_speed(iter/s)": 0.475627
},
{
"epoch": 2.3975340930319446,
"grad_norm": 0.3407681935903124,
"learning_rate": 3.931066566506648e-06,
"loss": 0.08079833984375,
"memory(GiB)": 10.57,
"step": 3210,
"train_speed(iter/s)": 0.475614
},
{
"epoch": 2.4012703157108164,
"grad_norm": 0.1829141400287362,
"learning_rate": 3.925488870783426e-06,
"loss": 0.08177490234375,
"memory(GiB)": 10.57,
"step": 3215,
"train_speed(iter/s)": 0.475612
},
{
"epoch": 2.405006538389688,
"grad_norm": 0.24647777146358466,
"learning_rate": 3.919907637147102e-06,
"loss": 0.081903076171875,
"memory(GiB)": 10.57,
"step": 3220,
"train_speed(iter/s)": 0.475609
},
{
"epoch": 2.4087427610685594,
"grad_norm": 0.38090689812957224,
"learning_rate": 3.914322886933341e-06,
"loss": 0.064569091796875,
"memory(GiB)": 10.57,
"step": 3225,
"train_speed(iter/s)": 0.475619
},
{
"epoch": 2.412478983747431,
"grad_norm": 0.2666319657744909,
"learning_rate": 3.908734641491248e-06,
"loss": 0.077764892578125,
"memory(GiB)": 10.57,
"step": 3230,
"train_speed(iter/s)": 0.475645
},
{
"epoch": 2.416215206426303,
"grad_norm": 0.22804209432893346,
"learning_rate": 3.903142922183294e-06,
"loss": 0.070025634765625,
"memory(GiB)": 10.57,
"step": 3235,
"train_speed(iter/s)": 0.475584
},
{
"epoch": 2.4199514291051747,
"grad_norm": 0.23685896651720773,
"learning_rate": 3.897547750385226e-06,
"loss": 0.0831634521484375,
"memory(GiB)": 10.57,
"step": 3240,
"train_speed(iter/s)": 0.475578
},
{
"epoch": 2.4236876517840464,
"grad_norm": 0.2355129405846085,
"learning_rate": 3.891949147485989e-06,
"loss": 0.077679443359375,
"memory(GiB)": 10.57,
"step": 3245,
"train_speed(iter/s)": 0.47556
},
{
"epoch": 2.427423874462918,
"grad_norm": 0.38970162877110276,
"learning_rate": 3.886347134887647e-06,
"loss": 0.0797607421875,
"memory(GiB)": 10.57,
"step": 3250,
"train_speed(iter/s)": 0.475557
},
{
"epoch": 2.4311600971417895,
"grad_norm": 0.2697647074819102,
"learning_rate": 3.8807417340052964e-06,
"loss": 0.0737060546875,
"memory(GiB)": 10.57,
"step": 3255,
"train_speed(iter/s)": 0.475577
},
{
"epoch": 2.4348963198206612,
"grad_norm": 0.19920837434880515,
"learning_rate": 3.875132966266987e-06,
"loss": 0.0791748046875,
"memory(GiB)": 10.57,
"step": 3260,
"train_speed(iter/s)": 0.475596
},
{
"epoch": 2.438632542499533,
"grad_norm": 0.22217603367413016,
"learning_rate": 3.869520853113637e-06,
"loss": 0.07099609375,
"memory(GiB)": 10.57,
"step": 3265,
"train_speed(iter/s)": 0.475601
},
{
"epoch": 2.4423687651784047,
"grad_norm": 0.310354028282849,
"learning_rate": 3.863905415998958e-06,
"loss": 0.075830078125,
"memory(GiB)": 10.57,
"step": 3270,
"train_speed(iter/s)": 0.475595
},
{
"epoch": 2.4461049878572765,
"grad_norm": 0.2904199442330529,
"learning_rate": 3.858286676389363e-06,
"loss": 0.07169189453125,
"memory(GiB)": 10.57,
"step": 3275,
"train_speed(iter/s)": 0.475577
},
{
"epoch": 2.449841210536148,
"grad_norm": 0.2671154417988313,
"learning_rate": 3.852664655763891e-06,
"loss": 0.0576446533203125,
"memory(GiB)": 10.57,
"step": 3280,
"train_speed(iter/s)": 0.475573
},
{
"epoch": 2.4535774332150195,
"grad_norm": 0.2117803221633462,
"learning_rate": 3.8470393756141285e-06,
"loss": 0.070208740234375,
"memory(GiB)": 10.57,
"step": 3285,
"train_speed(iter/s)": 0.475569
},
{
"epoch": 2.4573136558938913,
"grad_norm": 0.28365805075568284,
"learning_rate": 3.8414108574441155e-06,
"loss": 0.07728271484375,
"memory(GiB)": 10.57,
"step": 3290,
"train_speed(iter/s)": 0.475604
},
{
"epoch": 2.461049878572763,
"grad_norm": 0.26559512910109384,
"learning_rate": 3.835779122770274e-06,
"loss": 0.07513427734375,
"memory(GiB)": 10.57,
"step": 3295,
"train_speed(iter/s)": 0.475628
},
{
"epoch": 2.4647861012516348,
"grad_norm": 0.31583700464598574,
"learning_rate": 3.830144193121321e-06,
"loss": 0.0657806396484375,
"memory(GiB)": 10.57,
"step": 3300,
"train_speed(iter/s)": 0.475643
},
{
"epoch": 2.468522323930506,
"grad_norm": 0.2884092438790019,
"learning_rate": 3.824506090038185e-06,
"loss": 0.091070556640625,
"memory(GiB)": 10.57,
"step": 3305,
"train_speed(iter/s)": 0.475667
},
{
"epoch": 2.472258546609378,
"grad_norm": 0.3977319977360202,
"learning_rate": 3.818864835073931e-06,
"loss": 0.0851806640625,
"memory(GiB)": 10.57,
"step": 3310,
"train_speed(iter/s)": 0.475693
},
{
"epoch": 2.4759947692882496,
"grad_norm": 0.3494999636811868,
"learning_rate": 3.813220449793667e-06,
"loss": 0.064434814453125,
"memory(GiB)": 10.57,
"step": 3315,
"train_speed(iter/s)": 0.475688
},
{
"epoch": 2.4797309919671213,
"grad_norm": 0.17667298355698585,
"learning_rate": 3.8075729557744706e-06,
"loss": 0.06602783203125,
"memory(GiB)": 10.57,
"step": 3320,
"train_speed(iter/s)": 0.475718
},
{
"epoch": 2.483467214645993,
"grad_norm": 0.2847260138841454,
"learning_rate": 3.8019223746053037e-06,
"loss": 0.0813232421875,
"memory(GiB)": 10.57,
"step": 3325,
"train_speed(iter/s)": 0.47572
},
{
"epoch": 2.4872034373248644,
"grad_norm": 0.3276391701017016,
"learning_rate": 3.7962687278869266e-06,
"loss": 0.084173583984375,
"memory(GiB)": 10.57,
"step": 3330,
"train_speed(iter/s)": 0.47573
},
{
"epoch": 2.490939660003736,
"grad_norm": 0.20750116064295474,
"learning_rate": 3.7906120372318237e-06,
"loss": 0.055908203125,
"memory(GiB)": 10.57,
"step": 3335,
"train_speed(iter/s)": 0.475771
},
{
"epoch": 2.494675882682608,
"grad_norm": 0.21852160072540378,
"learning_rate": 3.784952324264109e-06,
"loss": 0.075030517578125,
"memory(GiB)": 10.57,
"step": 3340,
"train_speed(iter/s)": 0.475804
},
{
"epoch": 2.4984121053614796,
"grad_norm": 0.24279228051631654,
"learning_rate": 3.779289610619455e-06,
"loss": 0.07666015625,
"memory(GiB)": 10.57,
"step": 3345,
"train_speed(iter/s)": 0.475805
},
{
"epoch": 2.5021483280403514,
"grad_norm": 0.2904472098375547,
"learning_rate": 3.773623917945004e-06,
"loss": 0.092840576171875,
"memory(GiB)": 10.57,
"step": 3350,
"train_speed(iter/s)": 0.475809
},
{
"epoch": 2.505884550719223,
"grad_norm": 0.3311881989863495,
"learning_rate": 3.7679552678992854e-06,
"loss": 0.07431640625,
"memory(GiB)": 10.57,
"step": 3355,
"train_speed(iter/s)": 0.475802
},
{
"epoch": 2.5096207733980944,
"grad_norm": 0.347020365516737,
"learning_rate": 3.7622836821521346e-06,
"loss": 0.083404541015625,
"memory(GiB)": 10.57,
"step": 3360,
"train_speed(iter/s)": 0.475755
},
{
"epoch": 2.513356996076966,
"grad_norm": 0.30218078744076704,
"learning_rate": 3.7566091823846082e-06,
"loss": 0.080633544921875,
"memory(GiB)": 10.57,
"step": 3365,
"train_speed(iter/s)": 0.475751
},
{
"epoch": 2.517093218755838,
"grad_norm": 0.19250830743626035,
"learning_rate": 3.750931790288904e-06,
"loss": 0.070989990234375,
"memory(GiB)": 10.57,
"step": 3370,
"train_speed(iter/s)": 0.475766
},
{
"epoch": 2.5208294414347097,
"grad_norm": 0.3140116665074889,
"learning_rate": 3.745251527568276e-06,
"loss": 0.08988037109375,
"memory(GiB)": 10.57,
"step": 3375,
"train_speed(iter/s)": 0.475765
},
{
"epoch": 2.524565664113581,
"grad_norm": 0.27965921080609724,
"learning_rate": 3.7395684159369515e-06,
"loss": 0.0727783203125,
"memory(GiB)": 10.57,
"step": 3380,
"train_speed(iter/s)": 0.475783
},
{
"epoch": 2.5283018867924527,
"grad_norm": 0.2825039712001602,
"learning_rate": 3.733882477120049e-06,
"loss": 0.07235107421875,
"memory(GiB)": 10.57,
"step": 3385,
"train_speed(iter/s)": 0.475777
},
{
"epoch": 2.5320381094713245,
"grad_norm": 0.2817704189737431,
"learning_rate": 3.7281937328534927e-06,
"loss": 0.07215576171875,
"memory(GiB)": 10.57,
"step": 3390,
"train_speed(iter/s)": 0.475785
},
{
"epoch": 2.535774332150196,
"grad_norm": 0.2984895644961484,
"learning_rate": 3.7225022048839364e-06,
"loss": 0.07979736328125,
"memory(GiB)": 10.57,
"step": 3395,
"train_speed(iter/s)": 0.475804
},
{
"epoch": 2.539510554829068,
"grad_norm": 0.4297688864469516,
"learning_rate": 3.716807914968669e-06,
"loss": 0.0768310546875,
"memory(GiB)": 10.57,
"step": 3400,
"train_speed(iter/s)": 0.475802
},
{
"epoch": 2.5432467775079397,
"grad_norm": 0.2540092842763994,
"learning_rate": 3.7111108848755407e-06,
"loss": 0.080731201171875,
"memory(GiB)": 10.57,
"step": 3405,
"train_speed(iter/s)": 0.475804
},
{
"epoch": 2.546983000186811,
"grad_norm": 0.218855865695132,
"learning_rate": 3.705411136382877e-06,
"loss": 0.07509765625,
"memory(GiB)": 10.57,
"step": 3410,
"train_speed(iter/s)": 0.475824
},
{
"epoch": 2.5507192228656828,
"grad_norm": 0.31386617014735185,
"learning_rate": 3.6997086912793953e-06,
"loss": 0.08365478515625,
"memory(GiB)": 10.57,
"step": 3415,
"train_speed(iter/s)": 0.475796
},
{
"epoch": 2.5544554455445545,
"grad_norm": 0.2888393651203557,
"learning_rate": 3.69400357136412e-06,
"loss": 0.08245849609375,
"memory(GiB)": 10.57,
"step": 3420,
"train_speed(iter/s)": 0.475804
},
{
"epoch": 2.5581916682234263,
"grad_norm": 0.518767980813791,
"learning_rate": 3.6882957984463014e-06,
"loss": 0.084869384765625,
"memory(GiB)": 10.57,
"step": 3425,
"train_speed(iter/s)": 0.475798
},
{
"epoch": 2.5619278909022976,
"grad_norm": 0.24055934018386763,
"learning_rate": 3.6825853943453326e-06,
"loss": 0.07509765625,
"memory(GiB)": 10.57,
"step": 3430,
"train_speed(iter/s)": 0.475815
},
{
"epoch": 2.5656641135811693,
"grad_norm": 0.11607703015154515,
"learning_rate": 3.6768723808906624e-06,
"loss": 0.0733642578125,
"memory(GiB)": 10.57,
"step": 3435,
"train_speed(iter/s)": 0.475839
},
{
"epoch": 2.569400336260041,
"grad_norm": 0.2621128311109813,
"learning_rate": 3.6711567799217177e-06,
"loss": 0.07127685546875,
"memory(GiB)": 10.57,
"step": 3440,
"train_speed(iter/s)": 0.475869
},
{
"epoch": 2.573136558938913,
"grad_norm": 0.4650255643831401,
"learning_rate": 3.6654386132878153e-06,
"loss": 0.07940673828125,
"memory(GiB)": 10.57,
"step": 3445,
"train_speed(iter/s)": 0.475873
},
{
"epoch": 2.5768727816177845,
"grad_norm": 0.3724024885268326,
"learning_rate": 3.659717902848079e-06,
"loss": 0.07889404296875,
"memory(GiB)": 10.57,
"step": 3450,
"train_speed(iter/s)": 0.475871
},
{
"epoch": 2.5806090042966563,
"grad_norm": 0.23714008480261214,
"learning_rate": 3.653994670471358e-06,
"loss": 0.062042236328125,
"memory(GiB)": 10.57,
"step": 3455,
"train_speed(iter/s)": 0.475898
},
{
"epoch": 2.5843452269755276,
"grad_norm": 0.38138493209988716,
"learning_rate": 3.6482689380361434e-06,
"loss": 0.078564453125,
"memory(GiB)": 10.57,
"step": 3460,
"train_speed(iter/s)": 0.47589
},
{
"epoch": 2.5880814496543993,
"grad_norm": 0.2790205903786827,
"learning_rate": 3.6425407274304794e-06,
"loss": 0.07850341796875,
"memory(GiB)": 10.57,
"step": 3465,
"train_speed(iter/s)": 0.475897
},
{
"epoch": 2.591817672333271,
"grad_norm": 0.28268894066227623,
"learning_rate": 3.6368100605518895e-06,
"loss": 0.080084228515625,
"memory(GiB)": 10.57,
"step": 3470,
"train_speed(iter/s)": 0.4759
},
{
"epoch": 2.595553895012143,
"grad_norm": 0.40313615278345716,
"learning_rate": 3.631076959307282e-06,
"loss": 0.085107421875,
"memory(GiB)": 10.57,
"step": 3475,
"train_speed(iter/s)": 0.475908
},
{
"epoch": 2.599290117691014,
"grad_norm": 0.2734351199877751,
"learning_rate": 3.625341445612872e-06,
"loss": 0.084490966796875,
"memory(GiB)": 10.57,
"step": 3480,
"train_speed(iter/s)": 0.475939
},
{
"epoch": 2.603026340369886,
"grad_norm": 0.24165164144941384,
"learning_rate": 3.6196035413941004e-06,
"loss": 0.075732421875,
"memory(GiB)": 10.57,
"step": 3485,
"train_speed(iter/s)": 0.475926
},
{
"epoch": 2.6067625630487576,
"grad_norm": 0.22587276792049774,
"learning_rate": 3.6138632685855416e-06,
"loss": 0.06920166015625,
"memory(GiB)": 10.57,
"step": 3490,
"train_speed(iter/s)": 0.47595
},
{
"epoch": 2.6104987857276294,
"grad_norm": 0.26274757578605296,
"learning_rate": 3.608120649130827e-06,
"loss": 0.06964111328125,
"memory(GiB)": 10.57,
"step": 3495,
"train_speed(iter/s)": 0.475958
},
{
"epoch": 2.614235008406501,
"grad_norm": 0.2791749381588521,
"learning_rate": 3.602375704982559e-06,
"loss": 0.082159423828125,
"memory(GiB)": 10.57,
"step": 3500,
"train_speed(iter/s)": 0.475942
},
{
"epoch": 2.617971231085373,
"grad_norm": 0.19097386934636804,
"learning_rate": 3.5966284581022256e-06,
"loss": 0.071124267578125,
"memory(GiB)": 10.57,
"step": 3505,
"train_speed(iter/s)": 0.475946
},
{
"epoch": 2.621707453764244,
"grad_norm": 0.30489359623246215,
"learning_rate": 3.5908789304601187e-06,
"loss": 0.0773193359375,
"memory(GiB)": 10.57,
"step": 3510,
"train_speed(iter/s)": 0.475924
},
{
"epoch": 2.625443676443116,
"grad_norm": 0.3251670210353117,
"learning_rate": 3.585127144035247e-06,
"loss": 0.0652557373046875,
"memory(GiB)": 10.57,
"step": 3515,
"train_speed(iter/s)": 0.475915
},
{
"epoch": 2.6291798991219877,
"grad_norm": 0.47973710424124294,
"learning_rate": 3.579373120815257e-06,
"loss": 0.0652099609375,
"memory(GiB)": 10.57,
"step": 3520,
"train_speed(iter/s)": 0.47582
},
{
"epoch": 2.6329161218008594,
"grad_norm": 0.251813320258894,
"learning_rate": 3.5736168827963423e-06,
"loss": 0.0735595703125,
"memory(GiB)": 10.57,
"step": 3525,
"train_speed(iter/s)": 0.475822
},
{
"epoch": 2.6366523444797307,
"grad_norm": 0.16642948523661447,
"learning_rate": 3.567858451983167e-06,
"loss": 0.0711456298828125,
"memory(GiB)": 10.57,
"step": 3530,
"train_speed(iter/s)": 0.475776
},
{
"epoch": 2.6403885671586025,
"grad_norm": 0.2232206082433094,
"learning_rate": 3.562097850388775e-06,
"loss": 0.08082275390625,
"memory(GiB)": 10.57,
"step": 3535,
"train_speed(iter/s)": 0.475792
},
{
"epoch": 2.6441247898374742,
"grad_norm": 0.29955499401855273,
"learning_rate": 3.5563351000345077e-06,
"loss": 0.06729736328125,
"memory(GiB)": 10.57,
"step": 3540,
"train_speed(iter/s)": 0.475806
},
{
"epoch": 2.647861012516346,
"grad_norm": 0.3399121760483779,
"learning_rate": 3.5505702229499243e-06,
"loss": 0.0638671875,
"memory(GiB)": 10.57,
"step": 3545,
"train_speed(iter/s)": 0.475786
},
{
"epoch": 2.6515972351952177,
"grad_norm": 0.24813478944145864,
"learning_rate": 3.5448032411727123e-06,
"loss": 0.073760986328125,
"memory(GiB)": 10.57,
"step": 3550,
"train_speed(iter/s)": 0.475775
},
{
"epoch": 2.6553334578740895,
"grad_norm": 0.20754012538401892,
"learning_rate": 3.539034176748602e-06,
"loss": 0.069378662109375,
"memory(GiB)": 10.57,
"step": 3555,
"train_speed(iter/s)": 0.475759
},
{
"epoch": 2.6590696805529612,
"grad_norm": 0.3300071479044449,
"learning_rate": 3.53326305173129e-06,
"loss": 0.0831787109375,
"memory(GiB)": 10.57,
"step": 3560,
"train_speed(iter/s)": 0.475747
},
{
"epoch": 2.6628059032318325,
"grad_norm": 0.2418845408277716,
"learning_rate": 3.5274898881823466e-06,
"loss": 0.0650390625,
"memory(GiB)": 10.57,
"step": 3565,
"train_speed(iter/s)": 0.475754
},
{
"epoch": 2.6665421259107043,
"grad_norm": 0.191875325205025,
"learning_rate": 3.5217147081711363e-06,
"loss": 0.07650146484375,
"memory(GiB)": 10.57,
"step": 3570,
"train_speed(iter/s)": 0.475774
},
{
"epoch": 2.670278348589576,
"grad_norm": 0.2918403056701858,
"learning_rate": 3.515937533774732e-06,
"loss": 0.0787841796875,
"memory(GiB)": 10.57,
"step": 3575,
"train_speed(iter/s)": 0.475801
},
{
"epoch": 2.6740145712684473,
"grad_norm": 0.2103497141365804,
"learning_rate": 3.51015838707783e-06,
"loss": 0.083331298828125,
"memory(GiB)": 10.57,
"step": 3580,
"train_speed(iter/s)": 0.475836
},
{
"epoch": 2.677750793947319,
"grad_norm": 0.15535646417219773,
"learning_rate": 3.504377290172666e-06,
"loss": 0.0805419921875,
"memory(GiB)": 10.57,
"step": 3585,
"train_speed(iter/s)": 0.475811
},
{
"epoch": 2.681487016626191,
"grad_norm": 0.2156487636541889,
"learning_rate": 3.498594265158933e-06,
"loss": 0.0731689453125,
"memory(GiB)": 10.57,
"step": 3590,
"train_speed(iter/s)": 0.47582
},
{
"epoch": 2.6852232393050626,
"grad_norm": 0.31756593216849865,
"learning_rate": 3.4928093341436915e-06,
"loss": 0.08016357421875,
"memory(GiB)": 10.57,
"step": 3595,
"train_speed(iter/s)": 0.475826
},
{
"epoch": 2.6889594619839343,
"grad_norm": 0.17993011176812954,
"learning_rate": 3.4870225192412908e-06,
"loss": 0.068292236328125,
"memory(GiB)": 10.57,
"step": 3600,
"train_speed(iter/s)": 0.475817
},
{
"epoch": 2.692695684662806,
"grad_norm": 0.2563812995989066,
"learning_rate": 3.4812338425732808e-06,
"loss": 0.09036865234375,
"memory(GiB)": 10.57,
"step": 3605,
"train_speed(iter/s)": 0.475841
},
{
"epoch": 2.696431907341678,
"grad_norm": 0.21729858304510458,
"learning_rate": 3.4754433262683286e-06,
"loss": 0.070880126953125,
"memory(GiB)": 10.57,
"step": 3610,
"train_speed(iter/s)": 0.475864
},
{
"epoch": 2.700168130020549,
"grad_norm": 0.4448881083896266,
"learning_rate": 3.4696509924621324e-06,
"loss": 0.090478515625,
"memory(GiB)": 10.57,
"step": 3615,
"train_speed(iter/s)": 0.475831
},
{
"epoch": 2.703904352699421,
"grad_norm": 0.29692075196588846,
"learning_rate": 3.463856863297341e-06,
"loss": 0.078076171875,
"memory(GiB)": 10.57,
"step": 3620,
"train_speed(iter/s)": 0.475848
},
{
"epoch": 2.7076405753782926,
"grad_norm": 0.31954279997414836,
"learning_rate": 3.4580609609234648e-06,
"loss": 0.07919921875,
"memory(GiB)": 10.57,
"step": 3625,
"train_speed(iter/s)": 0.475834
},
{
"epoch": 2.7113767980571644,
"grad_norm": 0.1723702450513143,
"learning_rate": 3.4522633074967915e-06,
"loss": 0.074517822265625,
"memory(GiB)": 10.57,
"step": 3630,
"train_speed(iter/s)": 0.475811
},
{
"epoch": 2.7151130207360357,
"grad_norm": 0.22262320422842827,
"learning_rate": 3.4464639251803052e-06,
"loss": 0.070367431640625,
"memory(GiB)": 10.57,
"step": 3635,
"train_speed(iter/s)": 0.475826
},
{
"epoch": 2.7188492434149074,
"grad_norm": 0.28450955603049155,
"learning_rate": 3.4406628361435986e-06,
"loss": 0.08800048828125,
"memory(GiB)": 10.57,
"step": 3640,
"train_speed(iter/s)": 0.475849
},
{
"epoch": 2.722585466093779,
"grad_norm": 0.3537764688990701,
"learning_rate": 3.4348600625627853e-06,
"loss": 0.08115081787109375,
"memory(GiB)": 10.57,
"step": 3645,
"train_speed(iter/s)": 0.475856
},
{
"epoch": 2.726321688772651,
"grad_norm": 0.2717562915869466,
"learning_rate": 3.4290556266204255e-06,
"loss": 0.06995849609375,
"memory(GiB)": 10.57,
"step": 3650,
"train_speed(iter/s)": 0.475855
},
{
"epoch": 2.7300579114515227,
"grad_norm": 0.22750796325018738,
"learning_rate": 3.4232495505054263e-06,
"loss": 0.071771240234375,
"memory(GiB)": 10.57,
"step": 3655,
"train_speed(iter/s)": 0.475875
},
{
"epoch": 2.7337941341303944,
"grad_norm": 0.15412260555395027,
"learning_rate": 3.4174418564129683e-06,
"loss": 0.07366943359375,
"memory(GiB)": 10.57,
"step": 3660,
"train_speed(iter/s)": 0.475851
},
{
"epoch": 2.7375303568092657,
"grad_norm": 0.22006647714355373,
"learning_rate": 3.4116325665444205e-06,
"loss": 0.07138671875,
"memory(GiB)": 10.57,
"step": 3665,
"train_speed(iter/s)": 0.475871
},
{
"epoch": 2.7412665794881375,
"grad_norm": 0.42373302378912014,
"learning_rate": 3.405821703107247e-06,
"loss": 0.081640625,
"memory(GiB)": 10.57,
"step": 3670,
"train_speed(iter/s)": 0.475866
},
{
"epoch": 2.745002802167009,
"grad_norm": 0.25034251347665165,
"learning_rate": 3.4000092883149293e-06,
"loss": 0.07459716796875,
"memory(GiB)": 10.57,
"step": 3675,
"train_speed(iter/s)": 0.475862
},
{
"epoch": 2.748739024845881,
"grad_norm": 0.26815460719783096,
"learning_rate": 3.3941953443868794e-06,
"loss": 0.0758056640625,
"memory(GiB)": 10.57,
"step": 3680,
"train_speed(iter/s)": 0.475869
},
{
"epoch": 2.7524752475247523,
"grad_norm": 0.3488626865913072,
"learning_rate": 3.388379893548356e-06,
"loss": 0.076416015625,
"memory(GiB)": 10.57,
"step": 3685,
"train_speed(iter/s)": 0.475889
},
{
"epoch": 2.756211470203624,
"grad_norm": 0.2927879301365204,
"learning_rate": 3.382562958030375e-06,
"loss": 0.072265625,
"memory(GiB)": 10.57,
"step": 3690,
"train_speed(iter/s)": 0.475894
},
{
"epoch": 2.7599476928824958,
"grad_norm": 0.39819039701808595,
"learning_rate": 3.376744560069631e-06,
"loss": 0.0801025390625,
"memory(GiB)": 10.57,
"step": 3695,
"train_speed(iter/s)": 0.475889
},
{
"epoch": 2.7636839155613675,
"grad_norm": 0.27836721809953646,
"learning_rate": 3.370924721908408e-06,
"loss": 0.081817626953125,
"memory(GiB)": 10.57,
"step": 3700,
"train_speed(iter/s)": 0.475851
},
{
"epoch": 2.7674201382402392,
"grad_norm": 0.3159510466408062,
"learning_rate": 3.3651034657944944e-06,
"loss": 0.09007568359375,
"memory(GiB)": 10.57,
"step": 3705,
"train_speed(iter/s)": 0.475839
},
{
"epoch": 2.771156360919111,
"grad_norm": 0.2482343530491869,
"learning_rate": 3.3592808139811034e-06,
"loss": 0.08701171875,
"memory(GiB)": 10.57,
"step": 3710,
"train_speed(iter/s)": 0.475854
},
{
"epoch": 2.7748925835979823,
"grad_norm": 0.2212717362508163,
"learning_rate": 3.353456788726778e-06,
"loss": 0.089019775390625,
"memory(GiB)": 10.57,
"step": 3715,
"train_speed(iter/s)": 0.475852
},
{
"epoch": 2.778628806276854,
"grad_norm": 0.3180240539309867,
"learning_rate": 3.347631412295314e-06,
"loss": 0.078448486328125,
"memory(GiB)": 10.57,
"step": 3720,
"train_speed(iter/s)": 0.475768
},
{
"epoch": 2.782365028955726,
"grad_norm": 0.19694686614220888,
"learning_rate": 3.341804706955673e-06,
"loss": 0.071771240234375,
"memory(GiB)": 10.57,
"step": 3725,
"train_speed(iter/s)": 0.475772
},
{
"epoch": 2.7861012516345975,
"grad_norm": 0.27207148460273645,
"learning_rate": 3.335976694981898e-06,
"loss": 0.071990966796875,
"memory(GiB)": 10.57,
"step": 3730,
"train_speed(iter/s)": 0.475786
},
{
"epoch": 2.789837474313469,
"grad_norm": 0.2784440972147361,
"learning_rate": 3.3301473986530204e-06,
"loss": 0.08033447265625,
"memory(GiB)": 10.57,
"step": 3735,
"train_speed(iter/s)": 0.475803
},
{
"epoch": 2.7935736969923406,
"grad_norm": 0.384630172157372,
"learning_rate": 3.3243168402529903e-06,
"loss": 0.07603759765625,
"memory(GiB)": 10.57,
"step": 3740,
"train_speed(iter/s)": 0.475835
},
{
"epoch": 2.7973099196712123,
"grad_norm": 0.3015764425828606,
"learning_rate": 3.318485042070576e-06,
"loss": 0.070220947265625,
"memory(GiB)": 10.57,
"step": 3745,
"train_speed(iter/s)": 0.475867
},
{
"epoch": 2.801046142350084,
"grad_norm": 0.33638080331152426,
"learning_rate": 3.3126520263992883e-06,
"loss": 0.078277587890625,
"memory(GiB)": 10.57,
"step": 3750,
"train_speed(iter/s)": 0.475859
},
{
"epoch": 2.804782365028956,
"grad_norm": 0.2624352148398618,
"learning_rate": 3.306817815537291e-06,
"loss": 0.0696044921875,
"memory(GiB)": 10.57,
"step": 3755,
"train_speed(iter/s)": 0.47588
},
{
"epoch": 2.8085185877078276,
"grad_norm": 0.27781369223511193,
"learning_rate": 3.3009824317873164e-06,
"loss": 0.058050537109375,
"memory(GiB)": 10.57,
"step": 3760,
"train_speed(iter/s)": 0.475896
},
{
"epoch": 2.812254810386699,
"grad_norm": 0.1340015202269091,
"learning_rate": 3.2951458974565808e-06,
"loss": 0.08018798828125,
"memory(GiB)": 10.57,
"step": 3765,
"train_speed(iter/s)": 0.475889
},
{
"epoch": 2.8159910330655706,
"grad_norm": 0.20980509524344693,
"learning_rate": 3.2893082348567e-06,
"loss": 0.069110107421875,
"memory(GiB)": 10.57,
"step": 3770,
"train_speed(iter/s)": 0.475909
},
{
"epoch": 2.8197272557444424,
"grad_norm": 0.2501876137757298,
"learning_rate": 3.2834694663036016e-06,
"loss": 0.07905120849609375,
"memory(GiB)": 10.57,
"step": 3775,
"train_speed(iter/s)": 0.475912
},
{
"epoch": 2.823463478423314,
"grad_norm": 0.23719398237463618,
"learning_rate": 3.2776296141174405e-06,
"loss": 0.07977294921875,
"memory(GiB)": 10.57,
"step": 3780,
"train_speed(iter/s)": 0.475894
},
{
"epoch": 2.8271997011021854,
"grad_norm": 0.2019112294748815,
"learning_rate": 3.271788700622517e-06,
"loss": 0.067169189453125,
"memory(GiB)": 10.57,
"step": 3785,
"train_speed(iter/s)": 0.475868
},
{
"epoch": 2.830935923781057,
"grad_norm": 0.2569708345412305,
"learning_rate": 3.265946748147185e-06,
"loss": 0.08135986328125,
"memory(GiB)": 10.57,
"step": 3790,
"train_speed(iter/s)": 0.475837
},
{
"epoch": 2.834672146459929,
"grad_norm": 0.3039925539901921,
"learning_rate": 3.2601037790237713e-06,
"loss": 0.0752685546875,
"memory(GiB)": 10.57,
"step": 3795,
"train_speed(iter/s)": 0.475822
},
{
"epoch": 2.8384083691388007,
"grad_norm": 0.31263171782477395,
"learning_rate": 3.2542598155884905e-06,
"loss": 0.079345703125,
"memory(GiB)": 10.57,
"step": 3800,
"train_speed(iter/s)": 0.475843
},
{
"epoch": 2.8421445918176724,
"grad_norm": 0.3194717938532269,
"learning_rate": 3.2484148801813564e-06,
"loss": 0.0697540283203125,
"memory(GiB)": 10.57,
"step": 3805,
"train_speed(iter/s)": 0.475787
},
{
"epoch": 2.845880814496544,
"grad_norm": 0.2414957673117366,
"learning_rate": 3.242568995146099e-06,
"loss": 0.079833984375,
"memory(GiB)": 10.57,
"step": 3810,
"train_speed(iter/s)": 0.475812
},
{
"epoch": 2.849617037175416,
"grad_norm": 0.29766797126278466,
"learning_rate": 3.2367221828300797e-06,
"loss": 0.07156982421875,
"memory(GiB)": 10.57,
"step": 3815,
"train_speed(iter/s)": 0.475822
},
{
"epoch": 2.8533532598542872,
"grad_norm": 0.33562960678102366,
"learning_rate": 3.2308744655842023e-06,
"loss": 0.07691650390625,
"memory(GiB)": 10.57,
"step": 3820,
"train_speed(iter/s)": 0.475844
},
{
"epoch": 2.857089482533159,
"grad_norm": 0.23249083043588517,
"learning_rate": 3.2250258657628317e-06,
"loss": 0.0674591064453125,
"memory(GiB)": 10.57,
"step": 3825,
"train_speed(iter/s)": 0.475864
},
{
"epoch": 2.8608257052120307,
"grad_norm": 0.3868842372829782,
"learning_rate": 3.2191764057237057e-06,
"loss": 0.0788818359375,
"memory(GiB)": 10.57,
"step": 3830,
"train_speed(iter/s)": 0.475867
},
{
"epoch": 2.864561927890902,
"grad_norm": 0.2721643307415772,
"learning_rate": 3.2133261078278516e-06,
"loss": 0.076806640625,
"memory(GiB)": 10.57,
"step": 3835,
"train_speed(iter/s)": 0.475878
},
{
"epoch": 2.8682981505697738,
"grad_norm": 0.2509409981744641,
"learning_rate": 3.207474994439499e-06,
"loss": 0.07947998046875,
"memory(GiB)": 10.57,
"step": 3840,
"train_speed(iter/s)": 0.475893
},
{
"epoch": 2.8720343732486455,
"grad_norm": 0.2985391643876752,
"learning_rate": 3.2016230879259938e-06,
"loss": 0.08131103515625,
"memory(GiB)": 10.57,
"step": 3845,
"train_speed(iter/s)": 0.475879
},
{
"epoch": 2.8757705959275173,
"grad_norm": 0.34684741561716165,
"learning_rate": 3.195770410657717e-06,
"loss": 0.082269287109375,
"memory(GiB)": 10.57,
"step": 3850,
"train_speed(iter/s)": 0.475847
},
{
"epoch": 2.879506818606389,
"grad_norm": 0.23479279469344572,
"learning_rate": 3.189916985007991e-06,
"loss": 0.09420166015625,
"memory(GiB)": 10.57,
"step": 3855,
"train_speed(iter/s)": 0.475813
},
{
"epoch": 2.8832430412852608,
"grad_norm": 0.3907742470555341,
"learning_rate": 3.184062833353005e-06,
"loss": 0.07618408203125,
"memory(GiB)": 10.57,
"step": 3860,
"train_speed(iter/s)": 0.475799
},
{
"epoch": 2.8869792639641325,
"grad_norm": 0.19372123225177681,
"learning_rate": 3.178207978071719e-06,
"loss": 0.079144287109375,
"memory(GiB)": 10.57,
"step": 3865,
"train_speed(iter/s)": 0.475828
},
{
"epoch": 2.890715486643004,
"grad_norm": 0.3425380929045749,
"learning_rate": 3.1723524415457845e-06,
"loss": 0.085382080078125,
"memory(GiB)": 10.57,
"step": 3870,
"train_speed(iter/s)": 0.475816
},
{
"epoch": 2.8944517093218756,
"grad_norm": 0.3609396149048238,
"learning_rate": 3.166496246159457e-06,
"loss": 0.070849609375,
"memory(GiB)": 10.57,
"step": 3875,
"train_speed(iter/s)": 0.475828
},
{
"epoch": 2.8981879320007473,
"grad_norm": 0.20183491005738083,
"learning_rate": 3.160639414299511e-06,
"loss": 0.074884033203125,
"memory(GiB)": 10.57,
"step": 3880,
"train_speed(iter/s)": 0.475821
},
{
"epoch": 2.901924154679619,
"grad_norm": 0.2467148593569697,
"learning_rate": 3.154781968355153e-06,
"loss": 0.063775634765625,
"memory(GiB)": 10.57,
"step": 3885,
"train_speed(iter/s)": 0.475846
},
{
"epoch": 2.9056603773584904,
"grad_norm": 0.2637999747733018,
"learning_rate": 3.148923930717939e-06,
"loss": 0.0755615234375,
"memory(GiB)": 10.57,
"step": 3890,
"train_speed(iter/s)": 0.475849
},
{
"epoch": 2.909396600037362,
"grad_norm": 0.25527787190645407,
"learning_rate": 3.143065323781685e-06,
"loss": 0.06624755859375,
"memory(GiB)": 10.57,
"step": 3895,
"train_speed(iter/s)": 0.475865
},
{
"epoch": 2.913132822716234,
"grad_norm": 0.30417828277097125,
"learning_rate": 3.137206169942384e-06,
"loss": 0.073992919921875,
"memory(GiB)": 10.57,
"step": 3900,
"train_speed(iter/s)": 0.475832
},
{
"epoch": 2.9168690453951056,
"grad_norm": 0.2346109435926227,
"learning_rate": 3.131346491598119e-06,
"loss": 0.07637939453125,
"memory(GiB)": 10.57,
"step": 3905,
"train_speed(iter/s)": 0.47584
},
{
"epoch": 2.9206052680739774,
"grad_norm": 0.2353613119236764,
"learning_rate": 3.1254863111489804e-06,
"loss": 0.081158447265625,
"memory(GiB)": 10.57,
"step": 3910,
"train_speed(iter/s)": 0.475845
},
{
"epoch": 2.924341490752849,
"grad_norm": 0.3558838314274693,
"learning_rate": 3.119625650996974e-06,
"loss": 0.076300048828125,
"memory(GiB)": 10.57,
"step": 3915,
"train_speed(iter/s)": 0.475836
},
{
"epoch": 2.9280777134317204,
"grad_norm": 0.27354688251249265,
"learning_rate": 3.1137645335459434e-06,
"loss": 0.073907470703125,
"memory(GiB)": 10.57,
"step": 3920,
"train_speed(iter/s)": 0.475809
},
{
"epoch": 2.931813936110592,
"grad_norm": 0.3327608490083812,
"learning_rate": 3.107902981201478e-06,
"loss": 0.07683868408203125,
"memory(GiB)": 10.57,
"step": 3925,
"train_speed(iter/s)": 0.475779
},
{
"epoch": 2.935550158789464,
"grad_norm": 0.3747363988689518,
"learning_rate": 3.1020410163708304e-06,
"loss": 0.074114990234375,
"memory(GiB)": 10.57,
"step": 3930,
"train_speed(iter/s)": 0.475764
},
{
"epoch": 2.9392863814683357,
"grad_norm": 0.18606776814447884,
"learning_rate": 3.0961786614628308e-06,
"loss": 0.073858642578125,
"memory(GiB)": 10.57,
"step": 3935,
"train_speed(iter/s)": 0.475783
},
{
"epoch": 2.943022604147207,
"grad_norm": 0.22753548240298943,
"learning_rate": 3.0903159388877984e-06,
"loss": 0.07952880859375,
"memory(GiB)": 10.57,
"step": 3940,
"train_speed(iter/s)": 0.475798
},
{
"epoch": 2.9467588268260787,
"grad_norm": 0.2665097861133451,
"learning_rate": 3.0844528710574603e-06,
"loss": 0.08333740234375,
"memory(GiB)": 10.57,
"step": 3945,
"train_speed(iter/s)": 0.475797
},
{
"epoch": 2.9504950495049505,
"grad_norm": 0.17698058114731188,
"learning_rate": 3.0785894803848617e-06,
"loss": 0.069122314453125,
"memory(GiB)": 10.57,
"step": 3950,
"train_speed(iter/s)": 0.475778
},
{
"epoch": 2.954231272183822,
"grad_norm": 0.3104099022805613,
"learning_rate": 3.072725789284282e-06,
"loss": 0.062646484375,
"memory(GiB)": 10.57,
"step": 3955,
"train_speed(iter/s)": 0.475745
},
{
"epoch": 2.957967494862694,
"grad_norm": 0.20315634133154128,
"learning_rate": 3.0668618201711517e-06,
"loss": 0.08089599609375,
"memory(GiB)": 10.57,
"step": 3960,
"train_speed(iter/s)": 0.475758
},
{
"epoch": 2.9617037175415657,
"grad_norm": 0.25055279371623723,
"learning_rate": 3.0609975954619585e-06,
"loss": 0.070599365234375,
"memory(GiB)": 10.57,
"step": 3965,
"train_speed(iter/s)": 0.475775
},
{
"epoch": 2.965439940220437,
"grad_norm": 0.27358700735815494,
"learning_rate": 3.0551331375741753e-06,
"loss": 0.079913330078125,
"memory(GiB)": 10.57,
"step": 3970,
"train_speed(iter/s)": 0.475795
},
{
"epoch": 2.9691761628993087,
"grad_norm": 0.2701014272775072,
"learning_rate": 3.0492684689261587e-06,
"loss": 0.069427490234375,
"memory(GiB)": 10.57,
"step": 3975,
"train_speed(iter/s)": 0.475767
},
{
"epoch": 2.9729123855781805,
"grad_norm": 0.26839228857427083,
"learning_rate": 3.0434036119370734e-06,
"loss": 0.07572021484375,
"memory(GiB)": 10.57,
"step": 3980,
"train_speed(iter/s)": 0.475785
},
{
"epoch": 2.9766486082570522,
"grad_norm": 0.22716855276596393,
"learning_rate": 3.037538589026808e-06,
"loss": 0.08402099609375,
"memory(GiB)": 10.57,
"step": 3985,
"train_speed(iter/s)": 0.475805
},
{
"epoch": 2.9803848309359235,
"grad_norm": 0.2867732902522501,
"learning_rate": 3.03167342261588e-06,
"loss": 0.06982421875,
"memory(GiB)": 10.57,
"step": 3990,
"train_speed(iter/s)": 0.475802
},
{
"epoch": 2.9841210536147953,
"grad_norm": 0.1859176777096869,
"learning_rate": 3.0258081351253565e-06,
"loss": 0.073046875,
"memory(GiB)": 10.57,
"step": 3995,
"train_speed(iter/s)": 0.475829
},
{
"epoch": 2.987857276293667,
"grad_norm": 0.28880199434249176,
"learning_rate": 3.019942748976771e-06,
"loss": 0.092022705078125,
"memory(GiB)": 10.57,
"step": 4000,
"train_speed(iter/s)": 0.475846
},
{
"epoch": 2.991593498972539,
"grad_norm": 0.33890813145931753,
"learning_rate": 3.0140772865920308e-06,
"loss": 0.076885986328125,
"memory(GiB)": 10.57,
"step": 4005,
"train_speed(iter/s)": 0.475863
},
{
"epoch": 2.9953297216514105,
"grad_norm": 0.237266786675584,
"learning_rate": 3.0082117703933345e-06,
"loss": 0.088226318359375,
"memory(GiB)": 10.57,
"step": 4010,
"train_speed(iter/s)": 0.475845
},
{
"epoch": 2.9990659443302823,
"grad_norm": 0.2422362583040606,
"learning_rate": 3.002346222803089e-06,
"loss": 0.0780517578125,
"memory(GiB)": 10.57,
"step": 4015,
"train_speed(iter/s)": 0.475845
}
],
"logging_steps": 5,
"max_steps": 8034,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 1339,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 502166754164736.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}