{
  "d_model": 384,
  "n_layers": 8,
  "n_heads": 8,
  "n_kv_heads": 4,
  "d_ff": 1024,
  "K": 16,
  "max_len": 512,
  "entropy_reg": 0.02,
  "batch_size": 16,
  "seq_len": 256,
  "lr": 0.0003,
  "steps": 30000,
  "warmup_steps": 500,
  "val_every": 500,
  "save_every": 2500,
  "probe_steps": [
    1500,
    5000,
    15000,
    30000
  ],
  "vocab_size": 50257,
  "n_params": 37107592,
  "architecture": "CDM_V2_code",
  "device": "cuda",
  "dataset": "bigcode/starcoderdata:python"
}