Upload folder using huggingface_hub
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- checkpoints/metadata_000000032768.json +1 -1
- checkpoints/metadata_000000327680.json +1 -1
- checkpoints/metadata_000000360448.json +1 -1
- checkpoints/metadata_000000425984.json +1 -1
- checkpoints/metadata_000000458752.json +1 -1
- checkpoints/metadata_000000491520.json +1 -1
- checkpoints/metadata_000000557056.json +1 -1
- checkpoints/metadata_000000622592.json +1 -1
- checkpoints/metadata_000000688128.json +1 -1
- checkpoints/metadata_000000753664.json +1 -1
- checkpoints/metadata_000000819200.json +1 -1
- checkpoints/metadata_000000917504.json +1 -1
- checkpoints/metadata_000000983040.json +1 -1
- checkpoints/metadata_000001114112.json +1 -1
- checkpoints/metadata_000001212416.json +1 -1
- checkpoints/metadata_000001343488.json +1 -1
- checkpoints/metadata_000001474560.json +1 -1
- checkpoints/metadata_000001605632.json +1 -1
- checkpoints/metadata_000001769472.json +1 -1
- checkpoints/metadata_000001966080.json +1 -1
- checkpoints/metadata_000002162688.json +1 -1
- checkpoints/metadata_000002359296.json +1 -1
- checkpoints/metadata_000002621440.json +1 -1
- checkpoints/metadata_000002883584.json +1 -1
- checkpoints/metadata_000003178496.json +1 -1
- checkpoints/metadata_000003473408.json +1 -1
- checkpoints/metadata_000003833856.json +1 -1
- checkpoints/metadata_000004227072.json +1 -1
- checkpoints/metadata_000004653056.json +1 -1
- checkpoints/metadata_000005111808.json +1 -1
- checkpoints/metadata_000005603328.json +1 -1
- checkpoints/metadata_000006193152.json +1 -1
- checkpoints/metadata_000006782976.json +1 -1
- checkpoints/metadata_000007471104.json +1 -1
- checkpoints/metadata_000008224768.json +1 -1
- checkpoints/metadata_000009043968.json +1 -1
- checkpoints/metadata_000009961472.json +1 -1
- checkpoints/metadata_000010944512.json +1 -1
- checkpoints/metadata_000012058624.json +1 -1
- checkpoints/metadata_000013271040.json +1 -1
- checkpoints/metadata_000014581760.json +1 -1
- checkpoints/metadata_000016056320.json +1 -1
- checkpoints/metadata_000016384000.json +1 -1
- checkpoints/metadata_000017661952.json +1 -1
- checkpoints/metadata_000019431424.json +1 -1
- checkpoints/metadata_000021364736.json +1 -1
- checkpoints/metadata_000023494656.json +1 -1
- checkpoints/metadata_000025853952.json +1 -1
- checkpoints/metadata_000028442624.json +1 -1
.gitattributes
CHANGED
|
@@ -37,3 +37,4 @@ wandb/run-20260226_135602-696nxyfr/run-696nxyfr.wandb filter=lfs diff=lfs merge=
|
|
| 37 |
wandb/run-20260226_153026-trcpjlfd/run-trcpjlfd.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
wandb/run-20260319_063518-29lbcxak/run-29lbcxak.wandb filter=lfs diff=lfs merge=lfs -text
|
| 39 |
wandb/run-20260319_091054-lisp43b6/run-lisp43b6.wandb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 37 |
wandb/run-20260226_153026-trcpjlfd/run-trcpjlfd.wandb filter=lfs diff=lfs merge=lfs -text
|
| 38 |
wandb/run-20260319_063518-29lbcxak/run-29lbcxak.wandb filter=lfs diff=lfs merge=lfs -text
|
| 39 |
wandb/run-20260319_091054-lisp43b6/run-lisp43b6.wandb filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
wandb/run-20260325_092121-4guua5vm/run-4guua5vm.wandb filter=lfs diff=lfs merge=lfs -text
|
checkpoints/metadata_000000032768.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 1, "tokens_seen": 32768, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 1, "tokens_seen": 32768, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.490738868713379}
|
checkpoints/metadata_000000327680.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 10, "tokens_seen": 327680, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 10, "tokens_seen": 327680, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.488559456099267}
|
checkpoints/metadata_000000360448.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 11, "tokens_seen": 360448, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 11, "tokens_seen": 360448, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.487681315456733}
|
checkpoints/metadata_000000425984.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 13, "tokens_seen": 425984, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 13, "tokens_seen": 425984, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.485140885608613}
|
checkpoints/metadata_000000458752.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step":
|
|
|
|
| 1 |
+
{"step": 14, "tokens_seen": 458752, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.48362769925619}
|
checkpoints/metadata_000000491520.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step":
|
|
|
|
| 1 |
+
{"step": 15, "tokens_seen": 491520, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.481805614844804}
|
checkpoints/metadata_000000557056.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step":
|
|
|
|
| 1 |
+
{"step": 17, "tokens_seen": 557056, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.477386701187243}
|
checkpoints/metadata_000000622592.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 19, "tokens_seen": 622592, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 19, "tokens_seen": 622592, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.472137723118012}
|
checkpoints/metadata_000000688128.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 21, "tokens_seen": 688128, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 21, "tokens_seen": 688128, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.465674521810133}
|
checkpoints/metadata_000000753664.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 23, "tokens_seen": 753664, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 23, "tokens_seen": 753664, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.457512075850191}
|
checkpoints/metadata_000000819200.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 25, "tokens_seen": 819200, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 25, "tokens_seen": 819200, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.449279721179963}
|
checkpoints/metadata_000000917504.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 28, "tokens_seen": 917504, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 28, "tokens_seen": 917504, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.434475006481613}
|
checkpoints/metadata_000000983040.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step":
|
|
|
|
| 1 |
+
{"step": 30, "tokens_seen": 983040, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.423211635413145}
|
checkpoints/metadata_000001114112.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 34, "tokens_seen": 1114112, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.399444538393796}
|
checkpoints/metadata_000001212416.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 37, "tokens_seen": 1212416, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.381340059139271}
|
checkpoints/metadata_000001343488.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 41, "tokens_seen": 1343488, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.356302876533771}
|
checkpoints/metadata_000001474560.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 45, "tokens_seen": 1474560, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.328675141025588}
|
checkpoints/metadata_000001605632.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 49, "tokens_seen": 1605632, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.299211628700437}
|
checkpoints/metadata_000001769472.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 54, "tokens_seen": 1769472, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.261818793867313}
|
checkpoints/metadata_000001966080.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 60, "tokens_seen": 1966080, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.213050824344208}
|
checkpoints/metadata_000002162688.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 66, "tokens_seen": 2162688, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.161067744520894}
|
checkpoints/metadata_000002359296.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 72, "tokens_seen": 2359296, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.104137801951891}
|
checkpoints/metadata_000002621440.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 80, "tokens_seen": 2621440, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 10.0240865981226}
|
checkpoints/metadata_000002883584.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 88, "tokens_seen": 2883584, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.935583404823696}
|
checkpoints/metadata_000003178496.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 97, "tokens_seen": 3178496, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.8319135957099}
|
checkpoints/metadata_000003473408.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 106, "tokens_seen": 3473408, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.718994025528087}
|
checkpoints/metadata_000003833856.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 117, "tokens_seen": 3833856, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.57546330601019}
|
checkpoints/metadata_000004227072.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 129, "tokens_seen": 4227072, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.408515196645045}
|
checkpoints/metadata_000004653056.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 142, "tokens_seen": 4653056, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.23164245179911}
|
checkpoints/metadata_000005111808.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 156, "tokens_seen": 5111808, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 9.057175241009995}
|
checkpoints/metadata_000005603328.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 171, "tokens_seen": 5603328, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.882055448413595}
|
checkpoints/metadata_000006193152.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 189, "tokens_seen": 6193152, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.695822624511257}
|
checkpoints/metadata_000006782976.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 207, "tokens_seen": 6782976, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.536231485948507}
|
checkpoints/metadata_000007471104.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 228, "tokens_seen": 7471104, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.364853085187022}
|
checkpoints/metadata_000008224768.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 251, "tokens_seen": 8224768, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.20858914456734}
|
checkpoints/metadata_000009043968.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 276, "tokens_seen": 9043968, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 8.057145939779847}
|
checkpoints/metadata_000009961472.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 304, "tokens_seen": 9961472, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.9097705594246275}
|
checkpoints/metadata_000010944512.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 334, "tokens_seen": 10944512, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.779419562356378}
|
checkpoints/metadata_000012058624.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 368, "tokens_seen": 12058624, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.656809406264156}
|
checkpoints/metadata_000013271040.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 405, "tokens_seen": 13271040, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.54616898967276}
|
checkpoints/metadata_000014581760.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 445, "tokens_seen": 14581760, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.444932654284786}
|
checkpoints/metadata_000016056320.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 490, "tokens_seen": 16056320, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.351047587313347}
|
checkpoints/metadata_000016384000.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step":
|
|
|
|
| 1 |
+
{"step": 500, "tokens_seen": 16384000, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.338734772199267}
|
checkpoints/metadata_000017661952.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 539, "tokens_seen": 17661952, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.270154893091622}
|
checkpoints/metadata_000019431424.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 593, "tokens_seen": 19431424, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.18724598299121}
|
checkpoints/metadata_000021364736.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 652, "tokens_seen": 21364736, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.126651679338602}
|
checkpoints/metadata_000023494656.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 717, "tokens_seen": 23494656, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 7.061925158726517}
|
checkpoints/metadata_000025853952.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 789, "tokens_seen": 25853952, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.993057823476354}
|
checkpoints/metadata_000028442624.json
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "
|
|
|
|
| 1 |
+
{"step": 868, "tokens_seen": 28442624, "config": {"model_name": "pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "n_layers": 2, "d_model": 512, "d_mlp": 2048, "d_head": 64, "n_heads": 8, "attn_only": false, "layer_norm_eps": 1e-05, "init_range": 0.02, "n_ctx": 1024, "d_vocab": 32000, "dataset_name": "eoinf/pile_llama_mix_within_rows_pile_all_random_tokens_uniform_frac0d2", "tokenizer_name": "", "seed": 10, "data_seed": 10, "device": "cuda", "use_bfloat16_matmul": false, "batch_size_per_device": 32, "n_devices": 1, "batches_per_step": 1, "max_tokens": 200000000, "lr_hidden": 0.001, "lr_vector": 0.0005, "lr_schedule": "constant_with_warmup", "warmup_tokens": 30000000, "weight_decay": 0.05, "grad_norm_clip": 1.0, "train_loss_moving_average_beta": 0.99, "log_interval": 25, "save_checkpoints": true, "checkpoint_interval": 500, "checkpoint_interval_ratio": 1.1, "save_log_checkpoints": true, "use_wandb": true, "batch_size": 32, "tokens_per_step": 32768, "warmup_steps": 915, "max_steps": 6103}, "train_loss_ewma": 6.930244985859927}
|