Training in progress, step 8500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16ca55f673b3ad7e95262a9d0296f5d8f2b7edb92a87d108841a282630107b61
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0d62e7293944847d060dba9b65b5eb64216e79d1a688e81c3a95ed8977d7ce35
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3aeebf16be5d93156c95c5c47fce9ca30893837ac7097fcc26a2ec8d4dc9f51
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -7208,6 +7208,456 @@
|
|
| 7208 |
"mean_token_accuracy": 0.7778131783008575,
|
| 7209 |
"num_tokens": 8860114.0,
|
| 7210 |
"step": 8000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7211 |
}
|
| 7212 |
],
|
| 7213 |
"logging_steps": 10,
|
|
@@ -7227,7 +7677,7 @@
|
|
| 7227 |
"attributes": {}
|
| 7228 |
}
|
| 7229 |
},
|
| 7230 |
-
"total_flos": 1.
|
| 7231 |
"train_batch_size": 8,
|
| 7232 |
"trial_name": null,
|
| 7233 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.7126737860165222,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 8500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 7208 |
"mean_token_accuracy": 0.7778131783008575,
|
| 7209 |
"num_tokens": 8860114.0,
|
| 7210 |
"step": 8000
|
| 7211 |
+
},
|
| 7212 |
+
{
|
| 7213 |
+
"epoch": 1.613943179528511,
|
| 7214 |
+
"grad_norm": 12.875,
|
| 7215 |
+
"learning_rate": 9.24172207670092e-06,
|
| 7216 |
+
"loss": 0.9062,
|
| 7217 |
+
"mean_token_accuracy": 0.7869289875030517,
|
| 7218 |
+
"num_tokens": 8870296.0,
|
| 7219 |
+
"step": 8010
|
| 7220 |
+
},
|
| 7221 |
+
{
|
| 7222 |
+
"epoch": 1.615958089865001,
|
| 7223 |
+
"grad_norm": 11.0625,
|
| 7224 |
+
"learning_rate": 9.22828934112432e-06,
|
| 7225 |
+
"loss": 0.7193,
|
| 7226 |
+
"mean_token_accuracy": 0.8147738158702851,
|
| 7227 |
+
"num_tokens": 8880957.0,
|
| 7228 |
+
"step": 8020
|
| 7229 |
+
},
|
| 7230 |
+
{
|
| 7231 |
+
"epoch": 1.617973000201491,
|
| 7232 |
+
"grad_norm": 11.625,
|
| 7233 |
+
"learning_rate": 9.21485660554772e-06,
|
| 7234 |
+
"loss": 0.9484,
|
| 7235 |
+
"mean_token_accuracy": 0.7725982308387757,
|
| 7236 |
+
"num_tokens": 8892907.0,
|
| 7237 |
+
"step": 8030
|
| 7238 |
+
},
|
| 7239 |
+
{
|
| 7240 |
+
"epoch": 1.619987910537981,
|
| 7241 |
+
"grad_norm": 11.0625,
|
| 7242 |
+
"learning_rate": 9.20142386997112e-06,
|
| 7243 |
+
"loss": 0.7605,
|
| 7244 |
+
"mean_token_accuracy": 0.8034783184528351,
|
| 7245 |
+
"num_tokens": 8904461.0,
|
| 7246 |
+
"step": 8040
|
| 7247 |
+
},
|
| 7248 |
+
{
|
| 7249 |
+
"epoch": 1.6220028208744712,
|
| 7250 |
+
"grad_norm": 11.8125,
|
| 7251 |
+
"learning_rate": 9.187991134394521e-06,
|
| 7252 |
+
"loss": 0.8351,
|
| 7253 |
+
"mean_token_accuracy": 0.7964996695518494,
|
| 7254 |
+
"num_tokens": 8915489.0,
|
| 7255 |
+
"step": 8050
|
| 7256 |
+
},
|
| 7257 |
+
{
|
| 7258 |
+
"epoch": 1.6240177312109612,
|
| 7259 |
+
"grad_norm": 10.625,
|
| 7260 |
+
"learning_rate": 9.17455839881792e-06,
|
| 7261 |
+
"loss": 0.8234,
|
| 7262 |
+
"mean_token_accuracy": 0.7960014402866363,
|
| 7263 |
+
"num_tokens": 8927916.0,
|
| 7264 |
+
"step": 8060
|
| 7265 |
+
},
|
| 7266 |
+
{
|
| 7267 |
+
"epoch": 1.626032641547451,
|
| 7268 |
+
"grad_norm": 11.0,
|
| 7269 |
+
"learning_rate": 9.16112566324132e-06,
|
| 7270 |
+
"loss": 0.8144,
|
| 7271 |
+
"mean_token_accuracy": 0.7940633356571197,
|
| 7272 |
+
"num_tokens": 8938931.0,
|
| 7273 |
+
"step": 8070
|
| 7274 |
+
},
|
| 7275 |
+
{
|
| 7276 |
+
"epoch": 1.6280475518839412,
|
| 7277 |
+
"grad_norm": 10.4375,
|
| 7278 |
+
"learning_rate": 9.14769292766472e-06,
|
| 7279 |
+
"loss": 0.7865,
|
| 7280 |
+
"mean_token_accuracy": 0.806914460659027,
|
| 7281 |
+
"num_tokens": 8949201.0,
|
| 7282 |
+
"step": 8080
|
| 7283 |
+
},
|
| 7284 |
+
{
|
| 7285 |
+
"epoch": 1.6300624622204312,
|
| 7286 |
+
"grad_norm": 14.4375,
|
| 7287 |
+
"learning_rate": 9.13426019208812e-06,
|
| 7288 |
+
"loss": 0.8536,
|
| 7289 |
+
"mean_token_accuracy": 0.787992262840271,
|
| 7290 |
+
"num_tokens": 8960524.0,
|
| 7291 |
+
"step": 8090
|
| 7292 |
+
},
|
| 7293 |
+
{
|
| 7294 |
+
"epoch": 1.632077372556921,
|
| 7295 |
+
"grad_norm": 9.75,
|
| 7296 |
+
"learning_rate": 9.12082745651152e-06,
|
| 7297 |
+
"loss": 0.8154,
|
| 7298 |
+
"mean_token_accuracy": 0.7935736238956451,
|
| 7299 |
+
"num_tokens": 8972192.0,
|
| 7300 |
+
"step": 8100
|
| 7301 |
+
},
|
| 7302 |
+
{
|
| 7303 |
+
"epoch": 1.6340922828934112,
|
| 7304 |
+
"grad_norm": 13.375,
|
| 7305 |
+
"learning_rate": 9.107394720934919e-06,
|
| 7306 |
+
"loss": 0.782,
|
| 7307 |
+
"mean_token_accuracy": 0.802595990896225,
|
| 7308 |
+
"num_tokens": 8983325.0,
|
| 7309 |
+
"step": 8110
|
| 7310 |
+
},
|
| 7311 |
+
{
|
| 7312 |
+
"epoch": 1.6361071932299014,
|
| 7313 |
+
"grad_norm": 14.875,
|
| 7314 |
+
"learning_rate": 9.09396198535832e-06,
|
| 7315 |
+
"loss": 0.8885,
|
| 7316 |
+
"mean_token_accuracy": 0.779900997877121,
|
| 7317 |
+
"num_tokens": 8994056.0,
|
| 7318 |
+
"step": 8120
|
| 7319 |
+
},
|
| 7320 |
+
{
|
| 7321 |
+
"epoch": 1.6381221035663913,
|
| 7322 |
+
"grad_norm": 12.375,
|
| 7323 |
+
"learning_rate": 9.080529249781718e-06,
|
| 7324 |
+
"loss": 0.7469,
|
| 7325 |
+
"mean_token_accuracy": 0.8087693631649018,
|
| 7326 |
+
"num_tokens": 9004649.0,
|
| 7327 |
+
"step": 8130
|
| 7328 |
+
},
|
| 7329 |
+
{
|
| 7330 |
+
"epoch": 1.6401370139028812,
|
| 7331 |
+
"grad_norm": 12.375,
|
| 7332 |
+
"learning_rate": 9.067096514205117e-06,
|
| 7333 |
+
"loss": 0.9413,
|
| 7334 |
+
"mean_token_accuracy": 0.7759244620800019,
|
| 7335 |
+
"num_tokens": 9017546.0,
|
| 7336 |
+
"step": 8140
|
| 7337 |
+
},
|
| 7338 |
+
{
|
| 7339 |
+
"epoch": 1.6421519242393714,
|
| 7340 |
+
"grad_norm": 11.4375,
|
| 7341 |
+
"learning_rate": 9.053663778628518e-06,
|
| 7342 |
+
"loss": 0.8086,
|
| 7343 |
+
"mean_token_accuracy": 0.7987602353096008,
|
| 7344 |
+
"num_tokens": 9028816.0,
|
| 7345 |
+
"step": 8150
|
| 7346 |
+
},
|
| 7347 |
+
{
|
| 7348 |
+
"epoch": 1.6441668345758613,
|
| 7349 |
+
"grad_norm": 13.0,
|
| 7350 |
+
"learning_rate": 9.040231043051918e-06,
|
| 7351 |
+
"loss": 0.8092,
|
| 7352 |
+
"mean_token_accuracy": 0.8002450168132782,
|
| 7353 |
+
"num_tokens": 9040241.0,
|
| 7354 |
+
"step": 8160
|
| 7355 |
+
},
|
| 7356 |
+
{
|
| 7357 |
+
"epoch": 1.6461817449123513,
|
| 7358 |
+
"grad_norm": 11.875,
|
| 7359 |
+
"learning_rate": 9.026798307475319e-06,
|
| 7360 |
+
"loss": 0.8056,
|
| 7361 |
+
"mean_token_accuracy": 0.7970556557178498,
|
| 7362 |
+
"num_tokens": 9050944.0,
|
| 7363 |
+
"step": 8170
|
| 7364 |
+
},
|
| 7365 |
+
{
|
| 7366 |
+
"epoch": 1.6481966552488414,
|
| 7367 |
+
"grad_norm": 11.25,
|
| 7368 |
+
"learning_rate": 9.013365571898718e-06,
|
| 7369 |
+
"loss": 0.885,
|
| 7370 |
+
"mean_token_accuracy": 0.7855951130390167,
|
| 7371 |
+
"num_tokens": 9062338.0,
|
| 7372 |
+
"step": 8180
|
| 7373 |
+
},
|
| 7374 |
+
{
|
| 7375 |
+
"epoch": 1.6502115655853316,
|
| 7376 |
+
"grad_norm": 12.0625,
|
| 7377 |
+
"learning_rate": 8.999932836322117e-06,
|
| 7378 |
+
"loss": 0.8499,
|
| 7379 |
+
"mean_token_accuracy": 0.7869492530822754,
|
| 7380 |
+
"num_tokens": 9073217.0,
|
| 7381 |
+
"step": 8190
|
| 7382 |
+
},
|
| 7383 |
+
{
|
| 7384 |
+
"epoch": 1.6522264759218215,
|
| 7385 |
+
"grad_norm": 10.625,
|
| 7386 |
+
"learning_rate": 8.986500100745517e-06,
|
| 7387 |
+
"loss": 0.725,
|
| 7388 |
+
"mean_token_accuracy": 0.816201251745224,
|
| 7389 |
+
"num_tokens": 9084540.0,
|
| 7390 |
+
"step": 8200
|
| 7391 |
+
},
|
| 7392 |
+
{
|
| 7393 |
+
"epoch": 1.6542413862583114,
|
| 7394 |
+
"grad_norm": 14.625,
|
| 7395 |
+
"learning_rate": 8.973067365168918e-06,
|
| 7396 |
+
"loss": 0.8659,
|
| 7397 |
+
"mean_token_accuracy": 0.7861015915870666,
|
| 7398 |
+
"num_tokens": 9095828.0,
|
| 7399 |
+
"step": 8210
|
| 7400 |
+
},
|
| 7401 |
+
{
|
| 7402 |
+
"epoch": 1.6562562965948016,
|
| 7403 |
+
"grad_norm": 11.375,
|
| 7404 |
+
"learning_rate": 8.959634629592318e-06,
|
| 7405 |
+
"loss": 0.8543,
|
| 7406 |
+
"mean_token_accuracy": 0.7877636075019836,
|
| 7407 |
+
"num_tokens": 9105269.0,
|
| 7408 |
+
"step": 8220
|
| 7409 |
+
},
|
| 7410 |
+
{
|
| 7411 |
+
"epoch": 1.6582712069312917,
|
| 7412 |
+
"grad_norm": 11.8125,
|
| 7413 |
+
"learning_rate": 8.946201894015717e-06,
|
| 7414 |
+
"loss": 0.7655,
|
| 7415 |
+
"mean_token_accuracy": 0.8078620612621308,
|
| 7416 |
+
"num_tokens": 9115722.0,
|
| 7417 |
+
"step": 8230
|
| 7418 |
+
},
|
| 7419 |
+
{
|
| 7420 |
+
"epoch": 1.6602861172677816,
|
| 7421 |
+
"grad_norm": 12.1875,
|
| 7422 |
+
"learning_rate": 8.932769158439118e-06,
|
| 7423 |
+
"loss": 0.8754,
|
| 7424 |
+
"mean_token_accuracy": 0.7780845940113068,
|
| 7425 |
+
"num_tokens": 9126931.0,
|
| 7426 |
+
"step": 8240
|
| 7427 |
+
},
|
| 7428 |
+
{
|
| 7429 |
+
"epoch": 1.6623010276042716,
|
| 7430 |
+
"grad_norm": 14.0625,
|
| 7431 |
+
"learning_rate": 8.919336422862516e-06,
|
| 7432 |
+
"loss": 0.8186,
|
| 7433 |
+
"mean_token_accuracy": 0.7980137884616851,
|
| 7434 |
+
"num_tokens": 9137118.0,
|
| 7435 |
+
"step": 8250
|
| 7436 |
+
},
|
| 7437 |
+
{
|
| 7438 |
+
"epoch": 1.6643159379407617,
|
| 7439 |
+
"grad_norm": 11.3125,
|
| 7440 |
+
"learning_rate": 8.905903687285917e-06,
|
| 7441 |
+
"loss": 0.8233,
|
| 7442 |
+
"mean_token_accuracy": 0.7963060855865478,
|
| 7443 |
+
"num_tokens": 9148350.0,
|
| 7444 |
+
"step": 8260
|
| 7445 |
+
},
|
| 7446 |
+
{
|
| 7447 |
+
"epoch": 1.6663308482772516,
|
| 7448 |
+
"grad_norm": 12.25,
|
| 7449 |
+
"learning_rate": 8.892470951709317e-06,
|
| 7450 |
+
"loss": 0.8238,
|
| 7451 |
+
"mean_token_accuracy": 0.8000846326351165,
|
| 7452 |
+
"num_tokens": 9159385.0,
|
| 7453 |
+
"step": 8270
|
| 7454 |
+
},
|
| 7455 |
+
{
|
| 7456 |
+
"epoch": 1.6683457586137416,
|
| 7457 |
+
"grad_norm": 11.0,
|
| 7458 |
+
"learning_rate": 8.879038216132716e-06,
|
| 7459 |
+
"loss": 0.8972,
|
| 7460 |
+
"mean_token_accuracy": 0.7827898025512695,
|
| 7461 |
+
"num_tokens": 9170903.0,
|
| 7462 |
+
"step": 8280
|
| 7463 |
+
},
|
| 7464 |
+
{
|
| 7465 |
+
"epoch": 1.6703606689502317,
|
| 7466 |
+
"grad_norm": 11.625,
|
| 7467 |
+
"learning_rate": 8.865605480556117e-06,
|
| 7468 |
+
"loss": 0.7794,
|
| 7469 |
+
"mean_token_accuracy": 0.8025726079940796,
|
| 7470 |
+
"num_tokens": 9181737.0,
|
| 7471 |
+
"step": 8290
|
| 7472 |
+
},
|
| 7473 |
+
{
|
| 7474 |
+
"epoch": 1.6723755792867219,
|
| 7475 |
+
"grad_norm": 10.75,
|
| 7476 |
+
"learning_rate": 8.852172744979516e-06,
|
| 7477 |
+
"loss": 0.7777,
|
| 7478 |
+
"mean_token_accuracy": 0.8053012132644654,
|
| 7479 |
+
"num_tokens": 9193448.0,
|
| 7480 |
+
"step": 8300
|
| 7481 |
+
},
|
| 7482 |
+
{
|
| 7483 |
+
"epoch": 1.6743904896232118,
|
| 7484 |
+
"grad_norm": 11.4375,
|
| 7485 |
+
"learning_rate": 8.838740009402914e-06,
|
| 7486 |
+
"loss": 0.7605,
|
| 7487 |
+
"mean_token_accuracy": 0.8079525053501129,
|
| 7488 |
+
"num_tokens": 9204071.0,
|
| 7489 |
+
"step": 8310
|
| 7490 |
+
},
|
| 7491 |
+
{
|
| 7492 |
+
"epoch": 1.6764053999597017,
|
| 7493 |
+
"grad_norm": 11.5,
|
| 7494 |
+
"learning_rate": 8.825307273826315e-06,
|
| 7495 |
+
"loss": 0.9171,
|
| 7496 |
+
"mean_token_accuracy": 0.7748861670494079,
|
| 7497 |
+
"num_tokens": 9214504.0,
|
| 7498 |
+
"step": 8320
|
| 7499 |
+
},
|
| 7500 |
+
{
|
| 7501 |
+
"epoch": 1.6784203102961919,
|
| 7502 |
+
"grad_norm": 9.8125,
|
| 7503 |
+
"learning_rate": 8.811874538249716e-06,
|
| 7504 |
+
"loss": 0.8916,
|
| 7505 |
+
"mean_token_accuracy": 0.7793697714805603,
|
| 7506 |
+
"num_tokens": 9226284.0,
|
| 7507 |
+
"step": 8330
|
| 7508 |
+
},
|
| 7509 |
+
{
|
| 7510 |
+
"epoch": 1.6804352206326818,
|
| 7511 |
+
"grad_norm": 11.1875,
|
| 7512 |
+
"learning_rate": 8.798441802673116e-06,
|
| 7513 |
+
"loss": 0.7674,
|
| 7514 |
+
"mean_token_accuracy": 0.8027099728584289,
|
| 7515 |
+
"num_tokens": 9236745.0,
|
| 7516 |
+
"step": 8340
|
| 7517 |
+
},
|
| 7518 |
+
{
|
| 7519 |
+
"epoch": 1.6824501309691717,
|
| 7520 |
+
"grad_norm": 12.6875,
|
| 7521 |
+
"learning_rate": 8.785009067096515e-06,
|
| 7522 |
+
"loss": 0.7154,
|
| 7523 |
+
"mean_token_accuracy": 0.811217075586319,
|
| 7524 |
+
"num_tokens": 9246893.0,
|
| 7525 |
+
"step": 8350
|
| 7526 |
+
},
|
| 7527 |
+
{
|
| 7528 |
+
"epoch": 1.6844650413056619,
|
| 7529 |
+
"grad_norm": 10.8125,
|
| 7530 |
+
"learning_rate": 8.771576331519914e-06,
|
| 7531 |
+
"loss": 0.7992,
|
| 7532 |
+
"mean_token_accuracy": 0.8010998785495758,
|
| 7533 |
+
"num_tokens": 9257127.0,
|
| 7534 |
+
"step": 8360
|
| 7535 |
+
},
|
| 7536 |
+
{
|
| 7537 |
+
"epoch": 1.686479951642152,
|
| 7538 |
+
"grad_norm": 11.75,
|
| 7539 |
+
"learning_rate": 8.758143595943314e-06,
|
| 7540 |
+
"loss": 0.7553,
|
| 7541 |
+
"mean_token_accuracy": 0.8025872766971588,
|
| 7542 |
+
"num_tokens": 9267345.0,
|
| 7543 |
+
"step": 8370
|
| 7544 |
+
},
|
| 7545 |
+
{
|
| 7546 |
+
"epoch": 1.688494861978642,
|
| 7547 |
+
"grad_norm": 10.5625,
|
| 7548 |
+
"learning_rate": 8.744710860366715e-06,
|
| 7549 |
+
"loss": 0.7177,
|
| 7550 |
+
"mean_token_accuracy": 0.807683116197586,
|
| 7551 |
+
"num_tokens": 9278348.0,
|
| 7552 |
+
"step": 8380
|
| 7553 |
+
},
|
| 7554 |
+
{
|
| 7555 |
+
"epoch": 1.6905097723151319,
|
| 7556 |
+
"grad_norm": 10.6875,
|
| 7557 |
+
"learning_rate": 8.731278124790115e-06,
|
| 7558 |
+
"loss": 0.824,
|
| 7559 |
+
"mean_token_accuracy": 0.7992592275142669,
|
| 7560 |
+
"num_tokens": 9289759.0,
|
| 7561 |
+
"step": 8390
|
| 7562 |
+
},
|
| 7563 |
+
{
|
| 7564 |
+
"epoch": 1.692524682651622,
|
| 7565 |
+
"grad_norm": 10.0,
|
| 7566 |
+
"learning_rate": 8.717845389213514e-06,
|
| 7567 |
+
"loss": 0.7137,
|
| 7568 |
+
"mean_token_accuracy": 0.8179818749427795,
|
| 7569 |
+
"num_tokens": 9301077.0,
|
| 7570 |
+
"step": 8400
|
| 7571 |
+
},
|
| 7572 |
+
{
|
| 7573 |
+
"epoch": 1.6945395929881122,
|
| 7574 |
+
"grad_norm": 9.625,
|
| 7575 |
+
"learning_rate": 8.704412653636913e-06,
|
| 7576 |
+
"loss": 0.7854,
|
| 7577 |
+
"mean_token_accuracy": 0.8026704370975495,
|
| 7578 |
+
"num_tokens": 9311874.0,
|
| 7579 |
+
"step": 8410
|
| 7580 |
+
},
|
| 7581 |
+
{
|
| 7582 |
+
"epoch": 1.696554503324602,
|
| 7583 |
+
"grad_norm": 10.5,
|
| 7584 |
+
"learning_rate": 8.690979918060313e-06,
|
| 7585 |
+
"loss": 0.7699,
|
| 7586 |
+
"mean_token_accuracy": 0.8098136365413666,
|
| 7587 |
+
"num_tokens": 9322554.0,
|
| 7588 |
+
"step": 8420
|
| 7589 |
+
},
|
| 7590 |
+
{
|
| 7591 |
+
"epoch": 1.698569413661092,
|
| 7592 |
+
"grad_norm": 13.5,
|
| 7593 |
+
"learning_rate": 8.677547182483714e-06,
|
| 7594 |
+
"loss": 0.9204,
|
| 7595 |
+
"mean_token_accuracy": 0.7753970444202423,
|
| 7596 |
+
"num_tokens": 9334306.0,
|
| 7597 |
+
"step": 8430
|
| 7598 |
+
},
|
| 7599 |
+
{
|
| 7600 |
+
"epoch": 1.7005843239975822,
|
| 7601 |
+
"grad_norm": 10.375,
|
| 7602 |
+
"learning_rate": 8.664114446907113e-06,
|
| 7603 |
+
"loss": 0.7777,
|
| 7604 |
+
"mean_token_accuracy": 0.8091802179813385,
|
| 7605 |
+
"num_tokens": 9347003.0,
|
| 7606 |
+
"step": 8440
|
| 7607 |
+
},
|
| 7608 |
+
{
|
| 7609 |
+
"epoch": 1.7025992343340721,
|
| 7610 |
+
"grad_norm": 15.5625,
|
| 7611 |
+
"learning_rate": 8.650681711330513e-06,
|
| 7612 |
+
"loss": 0.7196,
|
| 7613 |
+
"mean_token_accuracy": 0.8092711210250855,
|
| 7614 |
+
"num_tokens": 9357300.0,
|
| 7615 |
+
"step": 8450
|
| 7616 |
+
},
|
| 7617 |
+
{
|
| 7618 |
+
"epoch": 1.704614144670562,
|
| 7619 |
+
"grad_norm": 11.3125,
|
| 7620 |
+
"learning_rate": 8.637248975753914e-06,
|
| 7621 |
+
"loss": 0.826,
|
| 7622 |
+
"mean_token_accuracy": 0.7981557488441468,
|
| 7623 |
+
"num_tokens": 9369935.0,
|
| 7624 |
+
"step": 8460
|
| 7625 |
+
},
|
| 7626 |
+
{
|
| 7627 |
+
"epoch": 1.7066290550070522,
|
| 7628 |
+
"grad_norm": 10.125,
|
| 7629 |
+
"learning_rate": 8.623816240177313e-06,
|
| 7630 |
+
"loss": 0.8045,
|
| 7631 |
+
"mean_token_accuracy": 0.8022239625453949,
|
| 7632 |
+
"num_tokens": 9381204.0,
|
| 7633 |
+
"step": 8470
|
| 7634 |
+
},
|
| 7635 |
+
{
|
| 7636 |
+
"epoch": 1.7086439653435423,
|
| 7637 |
+
"grad_norm": 11.9375,
|
| 7638 |
+
"learning_rate": 8.610383504600712e-06,
|
| 7639 |
+
"loss": 0.7834,
|
| 7640 |
+
"mean_token_accuracy": 0.7945830345153808,
|
| 7641 |
+
"num_tokens": 9391506.0,
|
| 7642 |
+
"step": 8480
|
| 7643 |
+
},
|
| 7644 |
+
{
|
| 7645 |
+
"epoch": 1.7106588756800323,
|
| 7646 |
+
"grad_norm": 11.75,
|
| 7647 |
+
"learning_rate": 8.596950769024112e-06,
|
| 7648 |
+
"loss": 0.8079,
|
| 7649 |
+
"mean_token_accuracy": 0.8012938261032104,
|
| 7650 |
+
"num_tokens": 9402691.0,
|
| 7651 |
+
"step": 8490
|
| 7652 |
+
},
|
| 7653 |
+
{
|
| 7654 |
+
"epoch": 1.7126737860165222,
|
| 7655 |
+
"grad_norm": 10.8125,
|
| 7656 |
+
"learning_rate": 8.583518033447513e-06,
|
| 7657 |
+
"loss": 0.8476,
|
| 7658 |
+
"mean_token_accuracy": 0.7890658736228943,
|
| 7659 |
+
"num_tokens": 9414095.0,
|
| 7660 |
+
"step": 8500
|
| 7661 |
}
|
| 7662 |
],
|
| 7663 |
"logging_steps": 10,
|
|
|
|
| 7677 |
"attributes": {}
|
| 7678 |
}
|
| 7679 |
},
|
| 7680 |
+
"total_flos": 1.1396175021686784e+16,
|
| 7681 |
"train_batch_size": 8,
|
| 7682 |
"trial_name": null,
|
| 7683 |
"trial_params": null
|