Training in progress, step 4000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17ab6fbe9c97d82ef7dac860e0afd63f233555e8f23a9fd5286c2c92aa0de809
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2fdccc0924c16c14bbca889730272d2d9adcc2fdeb5cc2188b22634e6a65ba6
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b9d0e16227a53d102f718b321b6ebc380604ad5e862513fc6df0711cea1a67f
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -3158,6 +3158,456 @@
|
|
| 3158 |
"mean_token_accuracy": 0.7891253709793091,
|
| 3159 |
"num_tokens": 3879065.0,
|
| 3160 |
"step": 3500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3161 |
}
|
| 3162 |
],
|
| 3163 |
"logging_steps": 10,
|
|
@@ -3177,7 +3627,7 @@
|
|
| 3177 |
"attributes": {}
|
| 3178 |
}
|
| 3179 |
},
|
| 3180 |
-
"total_flos":
|
| 3181 |
"train_batch_size": 8,
|
| 3182 |
"trial_name": null,
|
| 3183 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.8059641345960105,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 4000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 3158 |
"mean_token_accuracy": 0.7891253709793091,
|
| 3159 |
"num_tokens": 3879065.0,
|
| 3160 |
"step": 3500
|
| 3161 |
+
},
|
| 3162 |
+
{
|
| 3163 |
+
"epoch": 0.7072335281079992,
|
| 3164 |
+
"grad_norm": 12.25,
|
| 3165 |
+
"learning_rate": 1.5286453086171e-05,
|
| 3166 |
+
"loss": 1.021,
|
| 3167 |
+
"mean_token_accuracy": 0.7534075140953064,
|
| 3168 |
+
"num_tokens": 3890409.0,
|
| 3169 |
+
"step": 3510
|
| 3170 |
+
},
|
| 3171 |
+
{
|
| 3172 |
+
"epoch": 0.7092484384444893,
|
| 3173 |
+
"grad_norm": 11.0625,
|
| 3174 |
+
"learning_rate": 1.52730203505944e-05,
|
| 3175 |
+
"loss": 0.9314,
|
| 3176 |
+
"mean_token_accuracy": 0.7765843331813812,
|
| 3177 |
+
"num_tokens": 3902208.0,
|
| 3178 |
+
"step": 3520
|
| 3179 |
+
},
|
| 3180 |
+
{
|
| 3181 |
+
"epoch": 0.7112633487809793,
|
| 3182 |
+
"grad_norm": 9.875,
|
| 3183 |
+
"learning_rate": 1.52595876150178e-05,
|
| 3184 |
+
"loss": 0.7786,
|
| 3185 |
+
"mean_token_accuracy": 0.8059293925762177,
|
| 3186 |
+
"num_tokens": 3913221.0,
|
| 3187 |
+
"step": 3530
|
| 3188 |
+
},
|
| 3189 |
+
{
|
| 3190 |
+
"epoch": 0.7132782591174692,
|
| 3191 |
+
"grad_norm": 13.1875,
|
| 3192 |
+
"learning_rate": 1.52461548794412e-05,
|
| 3193 |
+
"loss": 0.8675,
|
| 3194 |
+
"mean_token_accuracy": 0.7885148406028748,
|
| 3195 |
+
"num_tokens": 3923202.0,
|
| 3196 |
+
"step": 3540
|
| 3197 |
+
},
|
| 3198 |
+
{
|
| 3199 |
+
"epoch": 0.7152931694539593,
|
| 3200 |
+
"grad_norm": 11.625,
|
| 3201 |
+
"learning_rate": 1.52327221438646e-05,
|
| 3202 |
+
"loss": 0.945,
|
| 3203 |
+
"mean_token_accuracy": 0.7707227051258088,
|
| 3204 |
+
"num_tokens": 3933469.0,
|
| 3205 |
+
"step": 3550
|
| 3206 |
+
},
|
| 3207 |
+
{
|
| 3208 |
+
"epoch": 0.7173080797904493,
|
| 3209 |
+
"grad_norm": 11.75,
|
| 3210 |
+
"learning_rate": 1.5219289408287999e-05,
|
| 3211 |
+
"loss": 0.8829,
|
| 3212 |
+
"mean_token_accuracy": 0.7817340910434722,
|
| 3213 |
+
"num_tokens": 3944523.0,
|
| 3214 |
+
"step": 3560
|
| 3215 |
+
},
|
| 3216 |
+
{
|
| 3217 |
+
"epoch": 0.7193229901269393,
|
| 3218 |
+
"grad_norm": 13.5625,
|
| 3219 |
+
"learning_rate": 1.52058566727114e-05,
|
| 3220 |
+
"loss": 0.9075,
|
| 3221 |
+
"mean_token_accuracy": 0.7843019485473632,
|
| 3222 |
+
"num_tokens": 3955650.0,
|
| 3223 |
+
"step": 3570
|
| 3224 |
+
},
|
| 3225 |
+
{
|
| 3226 |
+
"epoch": 0.7213379004634294,
|
| 3227 |
+
"grad_norm": 11.25,
|
| 3228 |
+
"learning_rate": 1.51924239371348e-05,
|
| 3229 |
+
"loss": 0.9087,
|
| 3230 |
+
"mean_token_accuracy": 0.7732051312923431,
|
| 3231 |
+
"num_tokens": 3967014.0,
|
| 3232 |
+
"step": 3580
|
| 3233 |
+
},
|
| 3234 |
+
{
|
| 3235 |
+
"epoch": 0.7233528107999194,
|
| 3236 |
+
"grad_norm": 11.0625,
|
| 3237 |
+
"learning_rate": 1.5178991201558197e-05,
|
| 3238 |
+
"loss": 0.8595,
|
| 3239 |
+
"mean_token_accuracy": 0.7891602039337158,
|
| 3240 |
+
"num_tokens": 3977669.0,
|
| 3241 |
+
"step": 3590
|
| 3242 |
+
},
|
| 3243 |
+
{
|
| 3244 |
+
"epoch": 0.7253677211364095,
|
| 3245 |
+
"grad_norm": 9.4375,
|
| 3246 |
+
"learning_rate": 1.5165558465981597e-05,
|
| 3247 |
+
"loss": 0.8617,
|
| 3248 |
+
"mean_token_accuracy": 0.7840434491634369,
|
| 3249 |
+
"num_tokens": 3989279.0,
|
| 3250 |
+
"step": 3600
|
| 3251 |
+
},
|
| 3252 |
+
{
|
| 3253 |
+
"epoch": 0.7273826314728995,
|
| 3254 |
+
"grad_norm": 11.3125,
|
| 3255 |
+
"learning_rate": 1.5152125730404998e-05,
|
| 3256 |
+
"loss": 0.8872,
|
| 3257 |
+
"mean_token_accuracy": 0.7797149956226349,
|
| 3258 |
+
"num_tokens": 4000448.0,
|
| 3259 |
+
"step": 3610
|
| 3260 |
+
},
|
| 3261 |
+
{
|
| 3262 |
+
"epoch": 0.7293975418093894,
|
| 3263 |
+
"grad_norm": 9.375,
|
| 3264 |
+
"learning_rate": 1.5138692994828398e-05,
|
| 3265 |
+
"loss": 0.8267,
|
| 3266 |
+
"mean_token_accuracy": 0.7977706253528595,
|
| 3267 |
+
"num_tokens": 4010787.0,
|
| 3268 |
+
"step": 3620
|
| 3269 |
+
},
|
| 3270 |
+
{
|
| 3271 |
+
"epoch": 0.7314124521458795,
|
| 3272 |
+
"grad_norm": 11.5625,
|
| 3273 |
+
"learning_rate": 1.5125260259251797e-05,
|
| 3274 |
+
"loss": 0.9227,
|
| 3275 |
+
"mean_token_accuracy": 0.7788041710853577,
|
| 3276 |
+
"num_tokens": 4021823.0,
|
| 3277 |
+
"step": 3630
|
| 3278 |
+
},
|
| 3279 |
+
{
|
| 3280 |
+
"epoch": 0.7334273624823695,
|
| 3281 |
+
"grad_norm": 9.1875,
|
| 3282 |
+
"learning_rate": 1.5111827523675198e-05,
|
| 3283 |
+
"loss": 0.988,
|
| 3284 |
+
"mean_token_accuracy": 0.7647354364395141,
|
| 3285 |
+
"num_tokens": 4034278.0,
|
| 3286 |
+
"step": 3640
|
| 3287 |
+
},
|
| 3288 |
+
{
|
| 3289 |
+
"epoch": 0.7354422728188595,
|
| 3290 |
+
"grad_norm": 10.8125,
|
| 3291 |
+
"learning_rate": 1.5098394788098598e-05,
|
| 3292 |
+
"loss": 1.0355,
|
| 3293 |
+
"mean_token_accuracy": 0.7544133722782135,
|
| 3294 |
+
"num_tokens": 4045371.0,
|
| 3295 |
+
"step": 3650
|
| 3296 |
+
},
|
| 3297 |
+
{
|
| 3298 |
+
"epoch": 0.7374571831553496,
|
| 3299 |
+
"grad_norm": 11.875,
|
| 3300 |
+
"learning_rate": 1.5084962052521997e-05,
|
| 3301 |
+
"loss": 0.8856,
|
| 3302 |
+
"mean_token_accuracy": 0.7889864265918731,
|
| 3303 |
+
"num_tokens": 4056216.0,
|
| 3304 |
+
"step": 3660
|
| 3305 |
+
},
|
| 3306 |
+
{
|
| 3307 |
+
"epoch": 0.7394720934918396,
|
| 3308 |
+
"grad_norm": 11.0625,
|
| 3309 |
+
"learning_rate": 1.5071529316945398e-05,
|
| 3310 |
+
"loss": 0.942,
|
| 3311 |
+
"mean_token_accuracy": 0.7709968864917756,
|
| 3312 |
+
"num_tokens": 4066100.0,
|
| 3313 |
+
"step": 3670
|
| 3314 |
+
},
|
| 3315 |
+
{
|
| 3316 |
+
"epoch": 0.7414870038283297,
|
| 3317 |
+
"grad_norm": 10.5625,
|
| 3318 |
+
"learning_rate": 1.5058096581368798e-05,
|
| 3319 |
+
"loss": 0.8474,
|
| 3320 |
+
"mean_token_accuracy": 0.7917793452739715,
|
| 3321 |
+
"num_tokens": 4076638.0,
|
| 3322 |
+
"step": 3680
|
| 3323 |
+
},
|
| 3324 |
+
{
|
| 3325 |
+
"epoch": 0.7435019141648197,
|
| 3326 |
+
"grad_norm": 12.625,
|
| 3327 |
+
"learning_rate": 1.5044663845792197e-05,
|
| 3328 |
+
"loss": 0.8937,
|
| 3329 |
+
"mean_token_accuracy": 0.779036569595337,
|
| 3330 |
+
"num_tokens": 4088398.0,
|
| 3331 |
+
"step": 3690
|
| 3332 |
+
},
|
| 3333 |
+
{
|
| 3334 |
+
"epoch": 0.7455168245013097,
|
| 3335 |
+
"grad_norm": 13.0625,
|
| 3336 |
+
"learning_rate": 1.5031231110215596e-05,
|
| 3337 |
+
"loss": 0.9305,
|
| 3338 |
+
"mean_token_accuracy": 0.7710303366184235,
|
| 3339 |
+
"num_tokens": 4100854.0,
|
| 3340 |
+
"step": 3700
|
| 3341 |
+
},
|
| 3342 |
+
{
|
| 3343 |
+
"epoch": 0.7475317348377997,
|
| 3344 |
+
"grad_norm": 10.375,
|
| 3345 |
+
"learning_rate": 1.5017798374638996e-05,
|
| 3346 |
+
"loss": 0.9195,
|
| 3347 |
+
"mean_token_accuracy": 0.7792443215847016,
|
| 3348 |
+
"num_tokens": 4113144.0,
|
| 3349 |
+
"step": 3710
|
| 3350 |
+
},
|
| 3351 |
+
{
|
| 3352 |
+
"epoch": 0.7495466451742897,
|
| 3353 |
+
"grad_norm": 10.25,
|
| 3354 |
+
"learning_rate": 1.5004365639062397e-05,
|
| 3355 |
+
"loss": 0.8205,
|
| 3356 |
+
"mean_token_accuracy": 0.8025223255157471,
|
| 3357 |
+
"num_tokens": 4124241.0,
|
| 3358 |
+
"step": 3720
|
| 3359 |
+
},
|
| 3360 |
+
{
|
| 3361 |
+
"epoch": 0.7515615555107797,
|
| 3362 |
+
"grad_norm": 13.375,
|
| 3363 |
+
"learning_rate": 1.4990932903485796e-05,
|
| 3364 |
+
"loss": 0.7566,
|
| 3365 |
+
"mean_token_accuracy": 0.8099412024021149,
|
| 3366 |
+
"num_tokens": 4134131.0,
|
| 3367 |
+
"step": 3730
|
| 3368 |
+
},
|
| 3369 |
+
{
|
| 3370 |
+
"epoch": 0.7535764658472698,
|
| 3371 |
+
"grad_norm": 9.3125,
|
| 3372 |
+
"learning_rate": 1.4977500167909196e-05,
|
| 3373 |
+
"loss": 0.8882,
|
| 3374 |
+
"mean_token_accuracy": 0.7791573405265808,
|
| 3375 |
+
"num_tokens": 4144499.0,
|
| 3376 |
+
"step": 3740
|
| 3377 |
+
},
|
| 3378 |
+
{
|
| 3379 |
+
"epoch": 0.7555913761837598,
|
| 3380 |
+
"grad_norm": 13.4375,
|
| 3381 |
+
"learning_rate": 1.4964067432332597e-05,
|
| 3382 |
+
"loss": 0.8661,
|
| 3383 |
+
"mean_token_accuracy": 0.7914558589458466,
|
| 3384 |
+
"num_tokens": 4155442.0,
|
| 3385 |
+
"step": 3750
|
| 3386 |
+
},
|
| 3387 |
+
{
|
| 3388 |
+
"epoch": 0.7576062865202499,
|
| 3389 |
+
"grad_norm": 13.5625,
|
| 3390 |
+
"learning_rate": 1.4950634696755994e-05,
|
| 3391 |
+
"loss": 0.8986,
|
| 3392 |
+
"mean_token_accuracy": 0.7791661143302917,
|
| 3393 |
+
"num_tokens": 4165905.0,
|
| 3394 |
+
"step": 3760
|
| 3395 |
+
},
|
| 3396 |
+
{
|
| 3397 |
+
"epoch": 0.7596211968567399,
|
| 3398 |
+
"grad_norm": 10.875,
|
| 3399 |
+
"learning_rate": 1.4937201961179395e-05,
|
| 3400 |
+
"loss": 0.9857,
|
| 3401 |
+
"mean_token_accuracy": 0.7646209299564362,
|
| 3402 |
+
"num_tokens": 4177252.0,
|
| 3403 |
+
"step": 3770
|
| 3404 |
+
},
|
| 3405 |
+
{
|
| 3406 |
+
"epoch": 0.7616361071932299,
|
| 3407 |
+
"grad_norm": 13.1875,
|
| 3408 |
+
"learning_rate": 1.4923769225602795e-05,
|
| 3409 |
+
"loss": 0.8163,
|
| 3410 |
+
"mean_token_accuracy": 0.8001804709434509,
|
| 3411 |
+
"num_tokens": 4187603.0,
|
| 3412 |
+
"step": 3780
|
| 3413 |
+
},
|
| 3414 |
+
{
|
| 3415 |
+
"epoch": 0.76365101752972,
|
| 3416 |
+
"grad_norm": 12.5625,
|
| 3417 |
+
"learning_rate": 1.4910336490026196e-05,
|
| 3418 |
+
"loss": 0.8719,
|
| 3419 |
+
"mean_token_accuracy": 0.793983542919159,
|
| 3420 |
+
"num_tokens": 4198158.0,
|
| 3421 |
+
"step": 3790
|
| 3422 |
+
},
|
| 3423 |
+
{
|
| 3424 |
+
"epoch": 0.7656659278662099,
|
| 3425 |
+
"grad_norm": 11.625,
|
| 3426 |
+
"learning_rate": 1.4896903754449594e-05,
|
| 3427 |
+
"loss": 0.8003,
|
| 3428 |
+
"mean_token_accuracy": 0.8059770345687867,
|
| 3429 |
+
"num_tokens": 4209371.0,
|
| 3430 |
+
"step": 3800
|
| 3431 |
+
},
|
| 3432 |
+
{
|
| 3433 |
+
"epoch": 0.7676808382026999,
|
| 3434 |
+
"grad_norm": 11.375,
|
| 3435 |
+
"learning_rate": 1.4883471018872995e-05,
|
| 3436 |
+
"loss": 0.8484,
|
| 3437 |
+
"mean_token_accuracy": 0.791538542509079,
|
| 3438 |
+
"num_tokens": 4220051.0,
|
| 3439 |
+
"step": 3810
|
| 3440 |
+
},
|
| 3441 |
+
{
|
| 3442 |
+
"epoch": 0.76969574853919,
|
| 3443 |
+
"grad_norm": 11.4375,
|
| 3444 |
+
"learning_rate": 1.4870038283296395e-05,
|
| 3445 |
+
"loss": 0.8216,
|
| 3446 |
+
"mean_token_accuracy": 0.7945187032222748,
|
| 3447 |
+
"num_tokens": 4230922.0,
|
| 3448 |
+
"step": 3820
|
| 3449 |
+
},
|
| 3450 |
+
{
|
| 3451 |
+
"epoch": 0.77171065887568,
|
| 3452 |
+
"grad_norm": 10.1875,
|
| 3453 |
+
"learning_rate": 1.4856605547719794e-05,
|
| 3454 |
+
"loss": 0.8319,
|
| 3455 |
+
"mean_token_accuracy": 0.7939063310623169,
|
| 3456 |
+
"num_tokens": 4242793.0,
|
| 3457 |
+
"step": 3830
|
| 3458 |
+
},
|
| 3459 |
+
{
|
| 3460 |
+
"epoch": 0.7737255692121701,
|
| 3461 |
+
"grad_norm": 14.125,
|
| 3462 |
+
"learning_rate": 1.4843172812143193e-05,
|
| 3463 |
+
"loss": 0.8577,
|
| 3464 |
+
"mean_token_accuracy": 0.7900285601615906,
|
| 3465 |
+
"num_tokens": 4253881.0,
|
| 3466 |
+
"step": 3840
|
| 3467 |
+
},
|
| 3468 |
+
{
|
| 3469 |
+
"epoch": 0.7757404795486601,
|
| 3470 |
+
"grad_norm": 10.875,
|
| 3471 |
+
"learning_rate": 1.4829740076566594e-05,
|
| 3472 |
+
"loss": 0.836,
|
| 3473 |
+
"mean_token_accuracy": 0.7931070744991302,
|
| 3474 |
+
"num_tokens": 4266304.0,
|
| 3475 |
+
"step": 3850
|
| 3476 |
+
},
|
| 3477 |
+
{
|
| 3478 |
+
"epoch": 0.7777553898851501,
|
| 3479 |
+
"grad_norm": 11.125,
|
| 3480 |
+
"learning_rate": 1.4816307340989994e-05,
|
| 3481 |
+
"loss": 1.0042,
|
| 3482 |
+
"mean_token_accuracy": 0.7616709470748901,
|
| 3483 |
+
"num_tokens": 4276817.0,
|
| 3484 |
+
"step": 3860
|
| 3485 |
+
},
|
| 3486 |
+
{
|
| 3487 |
+
"epoch": 0.7797703002216402,
|
| 3488 |
+
"grad_norm": 12.0625,
|
| 3489 |
+
"learning_rate": 1.4802874605413393e-05,
|
| 3490 |
+
"loss": 0.7827,
|
| 3491 |
+
"mean_token_accuracy": 0.8023504674434662,
|
| 3492 |
+
"num_tokens": 4286833.0,
|
| 3493 |
+
"step": 3870
|
| 3494 |
+
},
|
| 3495 |
+
{
|
| 3496 |
+
"epoch": 0.7817852105581302,
|
| 3497 |
+
"grad_norm": 12.125,
|
| 3498 |
+
"learning_rate": 1.4789441869836794e-05,
|
| 3499 |
+
"loss": 0.8489,
|
| 3500 |
+
"mean_token_accuracy": 0.7849018990993499,
|
| 3501 |
+
"num_tokens": 4297516.0,
|
| 3502 |
+
"step": 3880
|
| 3503 |
+
},
|
| 3504 |
+
{
|
| 3505 |
+
"epoch": 0.7838001208946201,
|
| 3506 |
+
"grad_norm": 11.625,
|
| 3507 |
+
"learning_rate": 1.4776009134260194e-05,
|
| 3508 |
+
"loss": 0.8809,
|
| 3509 |
+
"mean_token_accuracy": 0.7819288611412049,
|
| 3510 |
+
"num_tokens": 4309049.0,
|
| 3511 |
+
"step": 3890
|
| 3512 |
+
},
|
| 3513 |
+
{
|
| 3514 |
+
"epoch": 0.7858150312311102,
|
| 3515 |
+
"grad_norm": 10.625,
|
| 3516 |
+
"learning_rate": 1.4762576398683593e-05,
|
| 3517 |
+
"loss": 0.9198,
|
| 3518 |
+
"mean_token_accuracy": 0.7767218172550201,
|
| 3519 |
+
"num_tokens": 4320154.0,
|
| 3520 |
+
"step": 3900
|
| 3521 |
+
},
|
| 3522 |
+
{
|
| 3523 |
+
"epoch": 0.7878299415676002,
|
| 3524 |
+
"grad_norm": 12.0625,
|
| 3525 |
+
"learning_rate": 1.4749143663106993e-05,
|
| 3526 |
+
"loss": 0.9142,
|
| 3527 |
+
"mean_token_accuracy": 0.7742327690124512,
|
| 3528 |
+
"num_tokens": 4334166.0,
|
| 3529 |
+
"step": 3910
|
| 3530 |
+
},
|
| 3531 |
+
{
|
| 3532 |
+
"epoch": 0.7898448519040903,
|
| 3533 |
+
"grad_norm": 16.0,
|
| 3534 |
+
"learning_rate": 1.4735710927530394e-05,
|
| 3535 |
+
"loss": 0.8259,
|
| 3536 |
+
"mean_token_accuracy": 0.798123425245285,
|
| 3537 |
+
"num_tokens": 4344500.0,
|
| 3538 |
+
"step": 3920
|
| 3539 |
+
},
|
| 3540 |
+
{
|
| 3541 |
+
"epoch": 0.7918597622405803,
|
| 3542 |
+
"grad_norm": 12.1875,
|
| 3543 |
+
"learning_rate": 1.4722278191953791e-05,
|
| 3544 |
+
"loss": 0.8897,
|
| 3545 |
+
"mean_token_accuracy": 0.7863348364830017,
|
| 3546 |
+
"num_tokens": 4355554.0,
|
| 3547 |
+
"step": 3930
|
| 3548 |
+
},
|
| 3549 |
+
{
|
| 3550 |
+
"epoch": 0.7938746725770703,
|
| 3551 |
+
"grad_norm": 10.3125,
|
| 3552 |
+
"learning_rate": 1.4708845456377192e-05,
|
| 3553 |
+
"loss": 0.8904,
|
| 3554 |
+
"mean_token_accuracy": 0.7880643427371978,
|
| 3555 |
+
"num_tokens": 4365823.0,
|
| 3556 |
+
"step": 3940
|
| 3557 |
+
},
|
| 3558 |
+
{
|
| 3559 |
+
"epoch": 0.7958895829135604,
|
| 3560 |
+
"grad_norm": 15.375,
|
| 3561 |
+
"learning_rate": 1.4695412720800592e-05,
|
| 3562 |
+
"loss": 0.8622,
|
| 3563 |
+
"mean_token_accuracy": 0.7930482983589172,
|
| 3564 |
+
"num_tokens": 4377033.0,
|
| 3565 |
+
"step": 3950
|
| 3566 |
+
},
|
| 3567 |
+
{
|
| 3568 |
+
"epoch": 0.7979044932500504,
|
| 3569 |
+
"grad_norm": 11.75,
|
| 3570 |
+
"learning_rate": 1.4681979985223993e-05,
|
| 3571 |
+
"loss": 0.9426,
|
| 3572 |
+
"mean_token_accuracy": 0.7710152387619018,
|
| 3573 |
+
"num_tokens": 4387397.0,
|
| 3574 |
+
"step": 3960
|
| 3575 |
+
},
|
| 3576 |
+
{
|
| 3577 |
+
"epoch": 0.7999194035865403,
|
| 3578 |
+
"grad_norm": 10.5625,
|
| 3579 |
+
"learning_rate": 1.4668547249647392e-05,
|
| 3580 |
+
"loss": 0.8042,
|
| 3581 |
+
"mean_token_accuracy": 0.8007622241973877,
|
| 3582 |
+
"num_tokens": 4397683.0,
|
| 3583 |
+
"step": 3970
|
| 3584 |
+
},
|
| 3585 |
+
{
|
| 3586 |
+
"epoch": 0.8019343139230304,
|
| 3587 |
+
"grad_norm": 9.75,
|
| 3588 |
+
"learning_rate": 1.4655114514070792e-05,
|
| 3589 |
+
"loss": 0.8862,
|
| 3590 |
+
"mean_token_accuracy": 0.7830459952354432,
|
| 3591 |
+
"num_tokens": 4408500.0,
|
| 3592 |
+
"step": 3980
|
| 3593 |
+
},
|
| 3594 |
+
{
|
| 3595 |
+
"epoch": 0.8039492242595204,
|
| 3596 |
+
"grad_norm": 13.375,
|
| 3597 |
+
"learning_rate": 1.4641681778494193e-05,
|
| 3598 |
+
"loss": 0.9356,
|
| 3599 |
+
"mean_token_accuracy": 0.7747329294681549,
|
| 3600 |
+
"num_tokens": 4419148.0,
|
| 3601 |
+
"step": 3990
|
| 3602 |
+
},
|
| 3603 |
+
{
|
| 3604 |
+
"epoch": 0.8059641345960105,
|
| 3605 |
+
"grad_norm": 13.0625,
|
| 3606 |
+
"learning_rate": 1.462824904291759e-05,
|
| 3607 |
+
"loss": 0.9006,
|
| 3608 |
+
"mean_token_accuracy": 0.7772108554840088,
|
| 3609 |
+
"num_tokens": 4430041.0,
|
| 3610 |
+
"step": 4000
|
| 3611 |
}
|
| 3612 |
],
|
| 3613 |
"logging_steps": 10,
|
|
|
|
| 3627 |
"attributes": {}
|
| 3628 |
}
|
| 3629 |
},
|
| 3630 |
+
"total_flos": 5359648531077120.0,
|
| 3631 |
"train_batch_size": 8,
|
| 3632 |
"trial_name": null,
|
| 3633 |
"trial_params": null
|