Training in progress, step 7500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:445a128b149954e68d1af5a00630de0dc09e06cb78963d856ab9efe3a52157d9
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66488112339a052865703e73eb9d72b3f5f142ea84ea68d0b968dcf9eb080bb8
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88ec7f0fcbb8e83ac60a847dffeda029d1a65c084556d4707d85ad106bc04ba0
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6308,6 +6308,456 @@
|
|
| 6308 |
"mean_token_accuracy": 0.7988959193229676,
|
| 6309 |
"num_tokens": 7754571.0,
|
| 6310 |
"step": 7000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6311 |
}
|
| 6312 |
],
|
| 6313 |
"logging_steps": 10,
|
|
@@ -6327,7 +6777,7 @@
|
|
| 6327 |
"attributes": {}
|
| 6328 |
}
|
| 6329 |
},
|
| 6330 |
-
"total_flos":
|
| 6331 |
"train_batch_size": 8,
|
| 6332 |
"trial_name": null,
|
| 6333 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.5111827523675196,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 7500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6308 |
"mean_token_accuracy": 0.7988959193229676,
|
| 6309 |
"num_tokens": 7754571.0,
|
| 6310 |
"step": 7000
|
| 6311 |
+
},
|
| 6312 |
+
{
|
| 6313 |
+
"epoch": 1.4124521458795083,
|
| 6314 |
+
"grad_norm": 12.5625,
|
| 6315 |
+
"learning_rate": 1.0584995634360939e-05,
|
| 6316 |
+
"loss": 0.8291,
|
| 6317 |
+
"mean_token_accuracy": 0.7979937255382538,
|
| 6318 |
+
"num_tokens": 7765407.0,
|
| 6319 |
+
"step": 7010
|
| 6320 |
+
},
|
| 6321 |
+
{
|
| 6322 |
+
"epoch": 1.4144670562159984,
|
| 6323 |
+
"grad_norm": 11.0,
|
| 6324 |
+
"learning_rate": 1.0571562898784338e-05,
|
| 6325 |
+
"loss": 0.7867,
|
| 6326 |
+
"mean_token_accuracy": 0.804653775691986,
|
| 6327 |
+
"num_tokens": 7777508.0,
|
| 6328 |
+
"step": 7020
|
| 6329 |
+
},
|
| 6330 |
+
{
|
| 6331 |
+
"epoch": 1.4164819665524884,
|
| 6332 |
+
"grad_norm": 12.5,
|
| 6333 |
+
"learning_rate": 1.0558130163207738e-05,
|
| 6334 |
+
"loss": 0.9272,
|
| 6335 |
+
"mean_token_accuracy": 0.7761716663837432,
|
| 6336 |
+
"num_tokens": 7789199.0,
|
| 6337 |
+
"step": 7030
|
| 6338 |
+
},
|
| 6339 |
+
{
|
| 6340 |
+
"epoch": 1.4184968768889785,
|
| 6341 |
+
"grad_norm": 12.4375,
|
| 6342 |
+
"learning_rate": 1.0544697427631139e-05,
|
| 6343 |
+
"loss": 0.8449,
|
| 6344 |
+
"mean_token_accuracy": 0.7917571127414703,
|
| 6345 |
+
"num_tokens": 7800709.0,
|
| 6346 |
+
"step": 7040
|
| 6347 |
+
},
|
| 6348 |
+
{
|
| 6349 |
+
"epoch": 1.4205117872254684,
|
| 6350 |
+
"grad_norm": 13.0,
|
| 6351 |
+
"learning_rate": 1.0531264692054538e-05,
|
| 6352 |
+
"loss": 0.9143,
|
| 6353 |
+
"mean_token_accuracy": 0.7804217040538788,
|
| 6354 |
+
"num_tokens": 7813083.0,
|
| 6355 |
+
"step": 7050
|
| 6356 |
+
},
|
| 6357 |
+
{
|
| 6358 |
+
"epoch": 1.4225266975619584,
|
| 6359 |
+
"grad_norm": 11.875,
|
| 6360 |
+
"learning_rate": 1.0517831956477938e-05,
|
| 6361 |
+
"loss": 0.8094,
|
| 6362 |
+
"mean_token_accuracy": 0.8005238711833954,
|
| 6363 |
+
"num_tokens": 7823896.0,
|
| 6364 |
+
"step": 7060
|
| 6365 |
+
},
|
| 6366 |
+
{
|
| 6367 |
+
"epoch": 1.4245416078984485,
|
| 6368 |
+
"grad_norm": 11.9375,
|
| 6369 |
+
"learning_rate": 1.0504399220901339e-05,
|
| 6370 |
+
"loss": 0.773,
|
| 6371 |
+
"mean_token_accuracy": 0.8118914902210236,
|
| 6372 |
+
"num_tokens": 7834242.0,
|
| 6373 |
+
"step": 7070
|
| 6374 |
+
},
|
| 6375 |
+
{
|
| 6376 |
+
"epoch": 1.4265565182349387,
|
| 6377 |
+
"grad_norm": 10.0625,
|
| 6378 |
+
"learning_rate": 1.0490966485324736e-05,
|
| 6379 |
+
"loss": 0.8299,
|
| 6380 |
+
"mean_token_accuracy": 0.7923681199550628,
|
| 6381 |
+
"num_tokens": 7845550.0,
|
| 6382 |
+
"step": 7080
|
| 6383 |
+
},
|
| 6384 |
+
{
|
| 6385 |
+
"epoch": 1.4285714285714286,
|
| 6386 |
+
"grad_norm": 12.6875,
|
| 6387 |
+
"learning_rate": 1.0477533749748136e-05,
|
| 6388 |
+
"loss": 0.8102,
|
| 6389 |
+
"mean_token_accuracy": 0.7967373371124268,
|
| 6390 |
+
"num_tokens": 7856319.0,
|
| 6391 |
+
"step": 7090
|
| 6392 |
+
},
|
| 6393 |
+
{
|
| 6394 |
+
"epoch": 1.4305863389079185,
|
| 6395 |
+
"grad_norm": 11.9375,
|
| 6396 |
+
"learning_rate": 1.0464101014171537e-05,
|
| 6397 |
+
"loss": 0.9132,
|
| 6398 |
+
"mean_token_accuracy": 0.7812554478645325,
|
| 6399 |
+
"num_tokens": 7867021.0,
|
| 6400 |
+
"step": 7100
|
| 6401 |
+
},
|
| 6402 |
+
{
|
| 6403 |
+
"epoch": 1.4326012492444087,
|
| 6404 |
+
"grad_norm": 10.0625,
|
| 6405 |
+
"learning_rate": 1.0450668278594937e-05,
|
| 6406 |
+
"loss": 0.7123,
|
| 6407 |
+
"mean_token_accuracy": 0.8167718529701233,
|
| 6408 |
+
"num_tokens": 7877634.0,
|
| 6409 |
+
"step": 7110
|
| 6410 |
+
},
|
| 6411 |
+
{
|
| 6412 |
+
"epoch": 1.4346161595808986,
|
| 6413 |
+
"grad_norm": 11.0625,
|
| 6414 |
+
"learning_rate": 1.0437235543018336e-05,
|
| 6415 |
+
"loss": 0.8817,
|
| 6416 |
+
"mean_token_accuracy": 0.7886090099811554,
|
| 6417 |
+
"num_tokens": 7889170.0,
|
| 6418 |
+
"step": 7120
|
| 6419 |
+
},
|
| 6420 |
+
{
|
| 6421 |
+
"epoch": 1.4366310699173888,
|
| 6422 |
+
"grad_norm": 12.1875,
|
| 6423 |
+
"learning_rate": 1.0423802807441737e-05,
|
| 6424 |
+
"loss": 0.8589,
|
| 6425 |
+
"mean_token_accuracy": 0.7861130595207214,
|
| 6426 |
+
"num_tokens": 7899457.0,
|
| 6427 |
+
"step": 7130
|
| 6428 |
+
},
|
| 6429 |
+
{
|
| 6430 |
+
"epoch": 1.4386459802538787,
|
| 6431 |
+
"grad_norm": 13.0625,
|
| 6432 |
+
"learning_rate": 1.0410370071865137e-05,
|
| 6433 |
+
"loss": 0.9932,
|
| 6434 |
+
"mean_token_accuracy": 0.7648563742637634,
|
| 6435 |
+
"num_tokens": 7910678.0,
|
| 6436 |
+
"step": 7140
|
| 6437 |
+
},
|
| 6438 |
+
{
|
| 6439 |
+
"epoch": 1.4406608905903688,
|
| 6440 |
+
"grad_norm": 10.0,
|
| 6441 |
+
"learning_rate": 1.0396937336288534e-05,
|
| 6442 |
+
"loss": 0.9069,
|
| 6443 |
+
"mean_token_accuracy": 0.7735403776168823,
|
| 6444 |
+
"num_tokens": 7922908.0,
|
| 6445 |
+
"step": 7150
|
| 6446 |
+
},
|
| 6447 |
+
{
|
| 6448 |
+
"epoch": 1.4426758009268588,
|
| 6449 |
+
"grad_norm": 12.875,
|
| 6450 |
+
"learning_rate": 1.0383504600711935e-05,
|
| 6451 |
+
"loss": 0.8557,
|
| 6452 |
+
"mean_token_accuracy": 0.7988546848297119,
|
| 6453 |
+
"num_tokens": 7933362.0,
|
| 6454 |
+
"step": 7160
|
| 6455 |
+
},
|
| 6456 |
+
{
|
| 6457 |
+
"epoch": 1.4446907112633487,
|
| 6458 |
+
"grad_norm": 12.625,
|
| 6459 |
+
"learning_rate": 1.0370071865135335e-05,
|
| 6460 |
+
"loss": 0.8692,
|
| 6461 |
+
"mean_token_accuracy": 0.786381047964096,
|
| 6462 |
+
"num_tokens": 7944561.0,
|
| 6463 |
+
"step": 7170
|
| 6464 |
+
},
|
| 6465 |
+
{
|
| 6466 |
+
"epoch": 1.4467056215998388,
|
| 6467 |
+
"grad_norm": 12.6875,
|
| 6468 |
+
"learning_rate": 1.0356639129558736e-05,
|
| 6469 |
+
"loss": 0.9052,
|
| 6470 |
+
"mean_token_accuracy": 0.7769907891750336,
|
| 6471 |
+
"num_tokens": 7955497.0,
|
| 6472 |
+
"step": 7180
|
| 6473 |
+
},
|
| 6474 |
+
{
|
| 6475 |
+
"epoch": 1.4487205319363288,
|
| 6476 |
+
"grad_norm": 11.8125,
|
| 6477 |
+
"learning_rate": 1.0343206393982135e-05,
|
| 6478 |
+
"loss": 0.8062,
|
| 6479 |
+
"mean_token_accuracy": 0.7997641444206238,
|
| 6480 |
+
"num_tokens": 7966168.0,
|
| 6481 |
+
"step": 7190
|
| 6482 |
+
},
|
| 6483 |
+
{
|
| 6484 |
+
"epoch": 1.450735442272819,
|
| 6485 |
+
"grad_norm": 12.5,
|
| 6486 |
+
"learning_rate": 1.0329773658405535e-05,
|
| 6487 |
+
"loss": 0.8286,
|
| 6488 |
+
"mean_token_accuracy": 0.8001461684703827,
|
| 6489 |
+
"num_tokens": 7977058.0,
|
| 6490 |
+
"step": 7200
|
| 6491 |
+
},
|
| 6492 |
+
{
|
| 6493 |
+
"epoch": 1.4527503526093088,
|
| 6494 |
+
"grad_norm": 11.25,
|
| 6495 |
+
"learning_rate": 1.0316340922828936e-05,
|
| 6496 |
+
"loss": 0.8228,
|
| 6497 |
+
"mean_token_accuracy": 0.7994490921497345,
|
| 6498 |
+
"num_tokens": 7987859.0,
|
| 6499 |
+
"step": 7210
|
| 6500 |
+
},
|
| 6501 |
+
{
|
| 6502 |
+
"epoch": 1.454765262945799,
|
| 6503 |
+
"grad_norm": 10.8125,
|
| 6504 |
+
"learning_rate": 1.0302908187252335e-05,
|
| 6505 |
+
"loss": 0.8172,
|
| 6506 |
+
"mean_token_accuracy": 0.7964129328727723,
|
| 6507 |
+
"num_tokens": 7999424.0,
|
| 6508 |
+
"step": 7220
|
| 6509 |
+
},
|
| 6510 |
+
{
|
| 6511 |
+
"epoch": 1.456780173282289,
|
| 6512 |
+
"grad_norm": 12.3125,
|
| 6513 |
+
"learning_rate": 1.0289475451675735e-05,
|
| 6514 |
+
"loss": 0.8538,
|
| 6515 |
+
"mean_token_accuracy": 0.7890827238559723,
|
| 6516 |
+
"num_tokens": 8011181.0,
|
| 6517 |
+
"step": 7230
|
| 6518 |
+
},
|
| 6519 |
+
{
|
| 6520 |
+
"epoch": 1.4587950836187789,
|
| 6521 |
+
"grad_norm": 12.9375,
|
| 6522 |
+
"learning_rate": 1.0276042716099136e-05,
|
| 6523 |
+
"loss": 0.9005,
|
| 6524 |
+
"mean_token_accuracy": 0.7845316469669342,
|
| 6525 |
+
"num_tokens": 8021786.0,
|
| 6526 |
+
"step": 7240
|
| 6527 |
+
},
|
| 6528 |
+
{
|
| 6529 |
+
"epoch": 1.460809993955269,
|
| 6530 |
+
"grad_norm": 12.0,
|
| 6531 |
+
"learning_rate": 1.0262609980522533e-05,
|
| 6532 |
+
"loss": 0.8514,
|
| 6533 |
+
"mean_token_accuracy": 0.7937814593315125,
|
| 6534 |
+
"num_tokens": 8033599.0,
|
| 6535 |
+
"step": 7250
|
| 6536 |
+
},
|
| 6537 |
+
{
|
| 6538 |
+
"epoch": 1.4628249042917592,
|
| 6539 |
+
"grad_norm": 13.8125,
|
| 6540 |
+
"learning_rate": 1.0249177244945933e-05,
|
| 6541 |
+
"loss": 0.9692,
|
| 6542 |
+
"mean_token_accuracy": 0.768017840385437,
|
| 6543 |
+
"num_tokens": 8044948.0,
|
| 6544 |
+
"step": 7260
|
| 6545 |
+
},
|
| 6546 |
+
{
|
| 6547 |
+
"epoch": 1.464839814628249,
|
| 6548 |
+
"grad_norm": 10.0625,
|
| 6549 |
+
"learning_rate": 1.0235744509369334e-05,
|
| 6550 |
+
"loss": 0.8586,
|
| 6551 |
+
"mean_token_accuracy": 0.7864105820655822,
|
| 6552 |
+
"num_tokens": 8056601.0,
|
| 6553 |
+
"step": 7270
|
| 6554 |
+
},
|
| 6555 |
+
{
|
| 6556 |
+
"epoch": 1.466854724964739,
|
| 6557 |
+
"grad_norm": 10.875,
|
| 6558 |
+
"learning_rate": 1.0222311773792735e-05,
|
| 6559 |
+
"loss": 0.7389,
|
| 6560 |
+
"mean_token_accuracy": 0.8095929026603699,
|
| 6561 |
+
"num_tokens": 8067564.0,
|
| 6562 |
+
"step": 7280
|
| 6563 |
+
},
|
| 6564 |
+
{
|
| 6565 |
+
"epoch": 1.4688696353012292,
|
| 6566 |
+
"grad_norm": 10.75,
|
| 6567 |
+
"learning_rate": 1.0208879038216133e-05,
|
| 6568 |
+
"loss": 0.829,
|
| 6569 |
+
"mean_token_accuracy": 0.7980533838272095,
|
| 6570 |
+
"num_tokens": 8077900.0,
|
| 6571 |
+
"step": 7290
|
| 6572 |
+
},
|
| 6573 |
+
{
|
| 6574 |
+
"epoch": 1.470884545637719,
|
| 6575 |
+
"grad_norm": 11.5625,
|
| 6576 |
+
"learning_rate": 1.0195446302639534e-05,
|
| 6577 |
+
"loss": 0.783,
|
| 6578 |
+
"mean_token_accuracy": 0.8065039277076721,
|
| 6579 |
+
"num_tokens": 8088501.0,
|
| 6580 |
+
"step": 7300
|
| 6581 |
+
},
|
| 6582 |
+
{
|
| 6583 |
+
"epoch": 1.472899455974209,
|
| 6584 |
+
"grad_norm": 10.375,
|
| 6585 |
+
"learning_rate": 1.0182013567062934e-05,
|
| 6586 |
+
"loss": 0.7907,
|
| 6587 |
+
"mean_token_accuracy": 0.7920153796672821,
|
| 6588 |
+
"num_tokens": 8099942.0,
|
| 6589 |
+
"step": 7310
|
| 6590 |
+
},
|
| 6591 |
+
{
|
| 6592 |
+
"epoch": 1.4749143663106992,
|
| 6593 |
+
"grad_norm": 11.125,
|
| 6594 |
+
"learning_rate": 1.0168580831486332e-05,
|
| 6595 |
+
"loss": 0.8727,
|
| 6596 |
+
"mean_token_accuracy": 0.7863239705562591,
|
| 6597 |
+
"num_tokens": 8111532.0,
|
| 6598 |
+
"step": 7320
|
| 6599 |
+
},
|
| 6600 |
+
{
|
| 6601 |
+
"epoch": 1.4769292766471893,
|
| 6602 |
+
"grad_norm": 12.6875,
|
| 6603 |
+
"learning_rate": 1.0155148095909732e-05,
|
| 6604 |
+
"loss": 0.798,
|
| 6605 |
+
"mean_token_accuracy": 0.8027134239673615,
|
| 6606 |
+
"num_tokens": 8122335.0,
|
| 6607 |
+
"step": 7330
|
| 6608 |
+
},
|
| 6609 |
+
{
|
| 6610 |
+
"epoch": 1.4789441869836792,
|
| 6611 |
+
"grad_norm": 13.9375,
|
| 6612 |
+
"learning_rate": 1.0141715360333133e-05,
|
| 6613 |
+
"loss": 0.7377,
|
| 6614 |
+
"mean_token_accuracy": 0.8082942187786102,
|
| 6615 |
+
"num_tokens": 8132604.0,
|
| 6616 |
+
"step": 7340
|
| 6617 |
+
},
|
| 6618 |
+
{
|
| 6619 |
+
"epoch": 1.4809590973201692,
|
| 6620 |
+
"grad_norm": 11.5,
|
| 6621 |
+
"learning_rate": 1.0128282624756533e-05,
|
| 6622 |
+
"loss": 0.8337,
|
| 6623 |
+
"mean_token_accuracy": 0.7979135930538177,
|
| 6624 |
+
"num_tokens": 8143891.0,
|
| 6625 |
+
"step": 7350
|
| 6626 |
+
},
|
| 6627 |
+
{
|
| 6628 |
+
"epoch": 1.4829740076566593,
|
| 6629 |
+
"grad_norm": 13.375,
|
| 6630 |
+
"learning_rate": 1.0114849889179932e-05,
|
| 6631 |
+
"loss": 0.9091,
|
| 6632 |
+
"mean_token_accuracy": 0.7805217266082763,
|
| 6633 |
+
"num_tokens": 8154184.0,
|
| 6634 |
+
"step": 7360
|
| 6635 |
+
},
|
| 6636 |
+
{
|
| 6637 |
+
"epoch": 1.4849889179931492,
|
| 6638 |
+
"grad_norm": 9.875,
|
| 6639 |
+
"learning_rate": 1.0101417153603332e-05,
|
| 6640 |
+
"loss": 0.8451,
|
| 6641 |
+
"mean_token_accuracy": 0.7925164818763732,
|
| 6642 |
+
"num_tokens": 8165049.0,
|
| 6643 |
+
"step": 7370
|
| 6644 |
+
},
|
| 6645 |
+
{
|
| 6646 |
+
"epoch": 1.4870038283296394,
|
| 6647 |
+
"grad_norm": 12.0625,
|
| 6648 |
+
"learning_rate": 1.0087984418026733e-05,
|
| 6649 |
+
"loss": 0.8572,
|
| 6650 |
+
"mean_token_accuracy": 0.7849507808685303,
|
| 6651 |
+
"num_tokens": 8177037.0,
|
| 6652 |
+
"step": 7380
|
| 6653 |
+
},
|
| 6654 |
+
{
|
| 6655 |
+
"epoch": 1.4890187386661293,
|
| 6656 |
+
"grad_norm": 10.875,
|
| 6657 |
+
"learning_rate": 1.0074551682450132e-05,
|
| 6658 |
+
"loss": 0.8239,
|
| 6659 |
+
"mean_token_accuracy": 0.795056939125061,
|
| 6660 |
+
"num_tokens": 8187440.0,
|
| 6661 |
+
"step": 7390
|
| 6662 |
+
},
|
| 6663 |
+
{
|
| 6664 |
+
"epoch": 1.4910336490026195,
|
| 6665 |
+
"grad_norm": 10.0,
|
| 6666 |
+
"learning_rate": 1.006111894687353e-05,
|
| 6667 |
+
"loss": 0.8283,
|
| 6668 |
+
"mean_token_accuracy": 0.7943599224090576,
|
| 6669 |
+
"num_tokens": 8199890.0,
|
| 6670 |
+
"step": 7400
|
| 6671 |
+
},
|
| 6672 |
+
{
|
| 6673 |
+
"epoch": 1.4930485593391094,
|
| 6674 |
+
"grad_norm": 10.5625,
|
| 6675 |
+
"learning_rate": 1.0047686211296931e-05,
|
| 6676 |
+
"loss": 0.8196,
|
| 6677 |
+
"mean_token_accuracy": 0.7991042912006379,
|
| 6678 |
+
"num_tokens": 8211416.0,
|
| 6679 |
+
"step": 7410
|
| 6680 |
+
},
|
| 6681 |
+
{
|
| 6682 |
+
"epoch": 1.4950634696755993,
|
| 6683 |
+
"grad_norm": 15.125,
|
| 6684 |
+
"learning_rate": 1.003425347572033e-05,
|
| 6685 |
+
"loss": 0.7576,
|
| 6686 |
+
"mean_token_accuracy": 0.8098958432674408,
|
| 6687 |
+
"num_tokens": 8221936.0,
|
| 6688 |
+
"step": 7420
|
| 6689 |
+
},
|
| 6690 |
+
{
|
| 6691 |
+
"epoch": 1.4970783800120895,
|
| 6692 |
+
"grad_norm": 10.6875,
|
| 6693 |
+
"learning_rate": 1.002082074014373e-05,
|
| 6694 |
+
"loss": 0.7949,
|
| 6695 |
+
"mean_token_accuracy": 0.803859144449234,
|
| 6696 |
+
"num_tokens": 8232684.0,
|
| 6697 |
+
"step": 7430
|
| 6698 |
+
},
|
| 6699 |
+
{
|
| 6700 |
+
"epoch": 1.4990932903485794,
|
| 6701 |
+
"grad_norm": 12.5,
|
| 6702 |
+
"learning_rate": 1.0007388004567131e-05,
|
| 6703 |
+
"loss": 0.8918,
|
| 6704 |
+
"mean_token_accuracy": 0.7786654233932495,
|
| 6705 |
+
"num_tokens": 8244164.0,
|
| 6706 |
+
"step": 7440
|
| 6707 |
+
},
|
| 6708 |
+
{
|
| 6709 |
+
"epoch": 1.5011082006850696,
|
| 6710 |
+
"grad_norm": 11.4375,
|
| 6711 |
+
"learning_rate": 9.99395526899053e-06,
|
| 6712 |
+
"loss": 0.9013,
|
| 6713 |
+
"mean_token_accuracy": 0.7826810419559479,
|
| 6714 |
+
"num_tokens": 8255935.0,
|
| 6715 |
+
"step": 7450
|
| 6716 |
+
},
|
| 6717 |
+
{
|
| 6718 |
+
"epoch": 1.5031231110215595,
|
| 6719 |
+
"grad_norm": 11.4375,
|
| 6720 |
+
"learning_rate": 9.98052253341393e-06,
|
| 6721 |
+
"loss": 0.7827,
|
| 6722 |
+
"mean_token_accuracy": 0.8083594501018524,
|
| 6723 |
+
"num_tokens": 8267114.0,
|
| 6724 |
+
"step": 7460
|
| 6725 |
+
},
|
| 6726 |
+
{
|
| 6727 |
+
"epoch": 1.5051380213580496,
|
| 6728 |
+
"grad_norm": 12.0,
|
| 6729 |
+
"learning_rate": 9.967089797837331e-06,
|
| 6730 |
+
"loss": 0.7499,
|
| 6731 |
+
"mean_token_accuracy": 0.8060566544532776,
|
| 6732 |
+
"num_tokens": 8277828.0,
|
| 6733 |
+
"step": 7470
|
| 6734 |
+
},
|
| 6735 |
+
{
|
| 6736 |
+
"epoch": 1.5071529316945396,
|
| 6737 |
+
"grad_norm": 12.375,
|
| 6738 |
+
"learning_rate": 9.95365706226073e-06,
|
| 6739 |
+
"loss": 0.8611,
|
| 6740 |
+
"mean_token_accuracy": 0.7861000895500183,
|
| 6741 |
+
"num_tokens": 8289857.0,
|
| 6742 |
+
"step": 7480
|
| 6743 |
+
},
|
| 6744 |
+
{
|
| 6745 |
+
"epoch": 1.5091678420310295,
|
| 6746 |
+
"grad_norm": 12.125,
|
| 6747 |
+
"learning_rate": 9.94022432668413e-06,
|
| 6748 |
+
"loss": 0.8669,
|
| 6749 |
+
"mean_token_accuracy": 0.7892852067947388,
|
| 6750 |
+
"num_tokens": 8300286.0,
|
| 6751 |
+
"step": 7490
|
| 6752 |
+
},
|
| 6753 |
+
{
|
| 6754 |
+
"epoch": 1.5111827523675196,
|
| 6755 |
+
"grad_norm": 10.8125,
|
| 6756 |
+
"learning_rate": 9.92679159110753e-06,
|
| 6757 |
+
"loss": 0.7735,
|
| 6758 |
+
"mean_token_accuracy": 0.8061196208000183,
|
| 6759 |
+
"num_tokens": 8312344.0,
|
| 6760 |
+
"step": 7500
|
| 6761 |
}
|
| 6762 |
],
|
| 6763 |
"logging_steps": 10,
|
|
|
|
| 6777 |
"attributes": {}
|
| 6778 |
}
|
| 6779 |
},
|
| 6780 |
+
"total_flos": 1.006244257019904e+16,
|
| 6781 |
"train_batch_size": 8,
|
| 6782 |
"trial_name": null,
|
| 6783 |
"trial_params": null
|