Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +45 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/config.json +39 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/generation_config.json +6 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/merges.txt +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/pytorch_model.bin +3 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/special_tokens_map.json +6 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/tokenizer.json +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/tokenizer_config.json +21 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/vocab.json +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/config.json +39 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/generation_config.json +6 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/merges.txt +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/pytorch_model.bin +3 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/special_tokens_map.json +6 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/tokenizer.json +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/tokenizer_config.json +21 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/vocab.json +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/args.json +1 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/0/answers.jsonl +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/1/answers.jsonl +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/2/answers.jsonl +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/3/answers.jsonl +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/4/answers.jsonl +0 -0
- distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/log.txt +0 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/config.json +39 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/generation_config.json +6 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/merges.txt +0 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/pytorch_model.bin +3 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/special_tokens_map.json +6 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/tokenizer.json +0 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/tokenizer_config.json +21 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/vocab.json +0 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/args.json +1 -0
- distillm/results/gpt2/train/fdd_0.1B_1.5B/log.txt +0 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/config.json +39 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/generation_config.json +6 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/merges.txt +0 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/pytorch_model.bin +3 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/special_tokens_map.json +6 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/tokenizer.json +0 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/tokenizer_config.json +21 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/vocab.json +0 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/config.json +39 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/generation_config.json +6 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/merges.txt +0 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/pytorch_model.bin +3 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/special_tokens_map.json +6 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/tokenizer.json +0 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/tokenizer_config.json +21 -0
- distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/vocab.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,48 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
distillm/results/qwen1.5/distillm_0.5B_1.8B_on_srkl/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
distillm/results/qwen1.5/fdd_0.5B_1.8B/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
distillm/results/qwen1.5/sft/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
distillm/results/qwen1.5/spandistillm_0.5B_1.8B_on_v5/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
distillm/results/qwen1.5/spandistillm_0.5B_1.8B_on_v5/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
distillm/results/qwen1.5/spandistillm_a1/1428/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
distillm/results/qwen1.5/spandistillm_a1/2142/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
distillm/results/qwen1.5/spandistillm_a1/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
distillm/results/qwen1.5/spandistillm_a1/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
distillm/results/qwen1.5/spandistillm_a1/714/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
distillm/results/qwen1.5/spandistillm_a2/1428/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
distillm/results/qwen1.5/spandistillm_a2/2142/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
distillm/results/qwen1.5/spandistillm_a2/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
distillm/results/qwen1.5/spandistillm_a2/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
distillm/results/qwen1.5/spandistillm_a2/714/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
distillm/results/qwen1.5/spandistillm_a3/1428/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
distillm/results/qwen1.5/spandistillm_a3/2142/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
distillm/results/qwen1.5/spandistillm_a3/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
distillm/results/qwen1.5/spandistillm_a3/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
distillm/results/qwen1.5/spandistillm_a3/714/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
distillm/results/qwen1.5/spandistillm_a4/1428/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
distillm/results/qwen1.5/spandistillm_a4/2142/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
distillm/results/qwen1.5/spandistillm_a4/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
distillm/results/qwen1.5/spandistillm_a4/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
distillm/results/qwen1.5/spandistillm_a4/714/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
distillm/results/qwen1.5/spanfdd_0.5B_1.8B/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
distillm/results/qwen1.5/spanfdd_0.5B_1.8B/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
distillm/results/qwen2.5/fdd_3b/1428/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
distillm/results/qwen2.5/fdd_3b/2142/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
distillm/results/qwen2.5/fdd_3b/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
distillm/results/qwen2.5/fdd_3b/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
distillm/results/qwen2.5/fdd_3b/714/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
distillm/results/qwen2.5/fdd_3b_old/1428/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
distillm/results/qwen2.5/fdd_3b_old/2142/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
distillm/results/qwen2.5/fdd_3b_old/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
distillm/results/qwen2.5/fdd_3b_old/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 72 |
+
distillm/results/qwen2.5/fdd_3b_old/714/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 73 |
+
distillm/results/qwen2.5/spanfdd_3b/1428/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 74 |
+
distillm/results/qwen2.5/spanfdd_3b/2142/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 75 |
+
distillm/results/qwen2.5/spanfdd_3b/2856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 76 |
+
distillm/results/qwen2.5/spanfdd_3b/3570/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 77 |
+
distillm/results/qwen2.5/spanfdd_3b/714/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 78 |
+
distillm/results/qwen2.5/spanfdd_3b_old/1428/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 79 |
+
distillm/results/qwen2.5/spanfdd_3b_old/714/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 80 |
+
distillm/results/qwen3-8B-lora/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GPT2LMHeadModel"
|
| 5 |
+
],
|
| 6 |
+
"attn_pdrop": 0.1,
|
| 7 |
+
"bos_token_id": 50256,
|
| 8 |
+
"embd_pdrop": 0.1,
|
| 9 |
+
"eos_token_id": 50256,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"is_model_parallel": false,
|
| 12 |
+
"layer_norm_epsilon": 1e-05,
|
| 13 |
+
"model_type": "gpt2",
|
| 14 |
+
"n_ctx": 1024,
|
| 15 |
+
"n_embd": 768,
|
| 16 |
+
"n_head": 12,
|
| 17 |
+
"n_inner": null,
|
| 18 |
+
"n_layer": 12,
|
| 19 |
+
"n_positions": 1024,
|
| 20 |
+
"reorder_and_upcast_attn": false,
|
| 21 |
+
"resid_pdrop": 0.1,
|
| 22 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 23 |
+
"scale_attn_weights": true,
|
| 24 |
+
"summary_activation": null,
|
| 25 |
+
"summary_first_dropout": 0.1,
|
| 26 |
+
"summary_proj_to_labels": true,
|
| 27 |
+
"summary_type": "cls_index",
|
| 28 |
+
"summary_use_proj": true,
|
| 29 |
+
"task_specific_params": {
|
| 30 |
+
"text-generation": {
|
| 31 |
+
"do_sample": true,
|
| 32 |
+
"max_length": 50
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
"torch_dtype": "float16",
|
| 36 |
+
"transformers_version": "4.52.4",
|
| 37 |
+
"use_cache": true,
|
| 38 |
+
"vocab_size": 50257
|
| 39 |
+
}
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"transformers_version": "4.52.4"
|
| 6 |
+
}
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb9a751d7d1e4f0ca34f60d512cab63a9df88d57ce55e0a3b72f9a3e588c587c
|
| 3 |
+
size 248898556
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/2856/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GPT2LMHeadModel"
|
| 5 |
+
],
|
| 6 |
+
"attn_pdrop": 0.1,
|
| 7 |
+
"bos_token_id": 50256,
|
| 8 |
+
"embd_pdrop": 0.1,
|
| 9 |
+
"eos_token_id": 50256,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"is_model_parallel": false,
|
| 12 |
+
"layer_norm_epsilon": 1e-05,
|
| 13 |
+
"model_type": "gpt2",
|
| 14 |
+
"n_ctx": 1024,
|
| 15 |
+
"n_embd": 768,
|
| 16 |
+
"n_head": 12,
|
| 17 |
+
"n_inner": null,
|
| 18 |
+
"n_layer": 12,
|
| 19 |
+
"n_positions": 1024,
|
| 20 |
+
"reorder_and_upcast_attn": false,
|
| 21 |
+
"resid_pdrop": 0.1,
|
| 22 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 23 |
+
"scale_attn_weights": true,
|
| 24 |
+
"summary_activation": null,
|
| 25 |
+
"summary_first_dropout": 0.1,
|
| 26 |
+
"summary_proj_to_labels": true,
|
| 27 |
+
"summary_type": "cls_index",
|
| 28 |
+
"summary_use_proj": true,
|
| 29 |
+
"task_specific_params": {
|
| 30 |
+
"text-generation": {
|
| 31 |
+
"do_sample": true,
|
| 32 |
+
"max_length": 50
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
"torch_dtype": "float16",
|
| 36 |
+
"transformers_version": "4.52.4",
|
| 37 |
+
"use_cache": true,
|
| 38 |
+
"vocab_size": 50257
|
| 39 |
+
}
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"transformers_version": "4.52.4"
|
| 6 |
+
}
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2691ddfd76e6752faa3161d05c82b68cff673059d8607e8772522cb39e1e568
|
| 3 |
+
size 248898556
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/3570/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/args.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"model_path": "openai-community/gpt2", "ckpt_name": "gpt2-base", "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 1, "n_nodes": 1, "teacher_model_path": "MiniLLM/teacher-gpt2-1.5B", "teacher_ckpt_name": "xlarge-sft", "teacher_model_fp16": true, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "bf16": false, "type": "adaptive-srkl", "do_train": true, "do_valid": true, "do_eval": false, "base_path": "./distillm-master", "load": null, "save": "./distillm-master/results/gpt2/train/distill_0.1B_1.5B_on_srkl", "log_interval": 4, "mid_log_num": -1, "save_interval": -1, "eval_interval": -1, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "data_dir": "./distillm-master/processed_data/dolly/full/gpt2/", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": 1000, "dev_ratio": 1, "gen_num": -1, "data_names": null, "prompt_type": null, "num_workers": 1, "max_prompt_length": 128, "min_prompt_length": 128, "json_data": false, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "lm_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": true, "only_prompt": false, "batch_size": 8, "eval_batch_size": 64, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 256, "seed": 42, "seed_order": 42, "seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "epochs": 5, "training_epochs": 10000, "gradient_accumulation_steps": 2, "gradient_checkpointing": false, "attn_dtype": null, "lr": 0.0001, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_ratio": 1.0, "warmup_iters": 0, "warmup_ratio": 0.0, "lr_decay_iters": null, "lr_decay_style": "cosine", "scheduler_name": "constant_trm", "w_span_loss": 1.0, "reward_scaling": null, "cliprange_reward": 1, "ppo_epochs": null, "num_rollouts": 256, "num_rollouts_per_device": null, "cliprange": 0.2, "chunk_size": null, "gamma": 0.95, "length_norm": false, "single_step_reg": false, "teacher_mixed_alpha": null, "lm_coef": 1, "skew_alpha": 0.1, "student_gen": true, "gen_top_p": 1.0, "gen_num_beams": 1, "mixed_alpha": 0.5, "loss_eps": 0.1, "init_threshold": 0.0, "capacity": 1000, "replay_ratio": "decreasing", "student_layer_mapping": [-1], "teacher_layer_mapping": [-1], "split_layer_mapping": [0, 0, 0, 0], "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "peft": null, "peft_lora_r": 16, "peft_lora_alpha": 64, "peft_lora_dropout": 0.1, "peft_name": null, "peft_path": null, "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "./distillm-master/configs/deepspeed/ds_config.json", "deepscale": false, "deepscale_config": null, "rank": 0, "world_size": 1}
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/0/answers.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/1/answers.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/2/answers.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/3/answers.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/eval/4/answers.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/distill_0.1B_1.5B_on_srkl/log.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GPT2LMHeadModel"
|
| 5 |
+
],
|
| 6 |
+
"attn_pdrop": 0.1,
|
| 7 |
+
"bos_token_id": 50256,
|
| 8 |
+
"embd_pdrop": 0.1,
|
| 9 |
+
"eos_token_id": 50256,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"is_model_parallel": false,
|
| 12 |
+
"layer_norm_epsilon": 1e-05,
|
| 13 |
+
"model_type": "gpt2",
|
| 14 |
+
"n_ctx": 1024,
|
| 15 |
+
"n_embd": 768,
|
| 16 |
+
"n_head": 12,
|
| 17 |
+
"n_inner": null,
|
| 18 |
+
"n_layer": 12,
|
| 19 |
+
"n_positions": 1024,
|
| 20 |
+
"reorder_and_upcast_attn": false,
|
| 21 |
+
"resid_pdrop": 0.1,
|
| 22 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 23 |
+
"scale_attn_weights": true,
|
| 24 |
+
"summary_activation": null,
|
| 25 |
+
"summary_first_dropout": 0.1,
|
| 26 |
+
"summary_proj_to_labels": true,
|
| 27 |
+
"summary_type": "cls_index",
|
| 28 |
+
"summary_use_proj": true,
|
| 29 |
+
"task_specific_params": {
|
| 30 |
+
"text-generation": {
|
| 31 |
+
"do_sample": true,
|
| 32 |
+
"max_length": 50
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
"torch_dtype": "float16",
|
| 36 |
+
"transformers_version": "4.52.4",
|
| 37 |
+
"use_cache": true,
|
| 38 |
+
"vocab_size": 50257
|
| 39 |
+
}
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"transformers_version": "4.52.4"
|
| 6 |
+
}
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4491cd1e5268a437f0bcf832a4668b6c224d208a029fc4dabee8d8f26f987e4
|
| 3 |
+
size 248898556
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/3570/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/args.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"model_path": "openai-community/gpt2", "ckpt_name": "gpt2-base", "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 1, "n_nodes": 1, "teacher_model_path": "MiniLLM/teacher-gpt2-1.5B", "teacher_ckpt_name": "xlarge-sft", "teacher_model_fp16": true, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "bf16": false, "type": "adaptive-srkl", "do_train": true, "do_valid": true, "do_eval": false, "base_path": "./distillm-master", "load": null, "save": "./distillm-master/results/gpt2/train/fdd_0.1B_1.5B", "log_interval": 4, "mid_log_num": -1, "save_interval": -1, "eval_interval": -1, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "data_dir": "./distillm-master/processed_data/dolly/full/gpt2/", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": 1000, "dev_ratio": 1, "gen_num": -1, "data_names": null, "prompt_type": null, "num_workers": 1, "max_prompt_length": 128, "min_prompt_length": 128, "json_data": false, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "lm_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": true, "only_prompt": false, "batch_size": 16, "eval_batch_size": 64, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 256, "seed": 42, "seed_order": 42, "seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "epochs": 5, "training_epochs": 10000, "gradient_accumulation_steps": 1, "gradient_checkpointing": false, "attn_dtype": null, "lr": 0.0001, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_ratio": 1.0, "warmup_iters": 0, "warmup_ratio": 0.1, "lr_decay_iters": null, "lr_decay_style": "cosine", "scheduler_name": "constant_trm", "w_span_loss": 1.0, "reward_scaling": null, "cliprange_reward": 1, "ppo_epochs": null, "num_rollouts": 256, "num_rollouts_per_device": null, "cliprange": 0.2, "chunk_size": null, "gamma": 0.95, "length_norm": false, "single_step_reg": false, "teacher_mixed_alpha": null, "lm_coef": 1, "skew_alpha": 0.1, "student_gen": true, "gen_top_p": 1.0, "gen_num_beams": 1, "mixed_alpha": 0.5, "loss_eps": 0.1, "init_threshold": 0.0, "capacity": 1000, "replay_ratio": "decreasing", "student_layer_mapping": [4, 6, 8], "teacher_layer_mapping": [16, 24, 32], "split_layer_mapping": [0, 0, 0, 0], "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "peft": null, "peft_lora_r": 16, "peft_lora_alpha": 64, "peft_lora_dropout": 0.1, "peft_name": null, "peft_path": null, "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "./distillm-master/configs/deepspeed/ds_config.json", "deepscale": false, "deepscale_config": null, "rank": 0, "world_size": 1}
|
distillm/results/gpt2/train/fdd_0.1B_1.5B/log.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GPT2LMHeadModel"
|
| 5 |
+
],
|
| 6 |
+
"attn_pdrop": 0.1,
|
| 7 |
+
"bos_token_id": 50256,
|
| 8 |
+
"embd_pdrop": 0.1,
|
| 9 |
+
"eos_token_id": 50256,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"is_model_parallel": false,
|
| 12 |
+
"layer_norm_epsilon": 1e-05,
|
| 13 |
+
"model_type": "gpt2",
|
| 14 |
+
"n_ctx": 1024,
|
| 15 |
+
"n_embd": 768,
|
| 16 |
+
"n_head": 12,
|
| 17 |
+
"n_inner": null,
|
| 18 |
+
"n_layer": 12,
|
| 19 |
+
"n_positions": 1024,
|
| 20 |
+
"reorder_and_upcast_attn": false,
|
| 21 |
+
"resid_pdrop": 0.1,
|
| 22 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 23 |
+
"scale_attn_weights": true,
|
| 24 |
+
"summary_activation": null,
|
| 25 |
+
"summary_first_dropout": 0.1,
|
| 26 |
+
"summary_proj_to_labels": true,
|
| 27 |
+
"summary_type": "cls_index",
|
| 28 |
+
"summary_use_proj": true,
|
| 29 |
+
"task_specific_params": {
|
| 30 |
+
"text-generation": {
|
| 31 |
+
"do_sample": true,
|
| 32 |
+
"max_length": 50
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
"torch_dtype": "float16",
|
| 36 |
+
"transformers_version": "4.52.4",
|
| 37 |
+
"use_cache": true,
|
| 38 |
+
"vocab_size": 50257
|
| 39 |
+
}
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"transformers_version": "4.52.4"
|
| 6 |
+
}
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b182acda70e98919c4cde9e621f3d20d9261f6467c7d59efbfe0c0dab8f8ab4
|
| 3 |
+
size 256281854
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/config.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"GPT2LMHeadModel"
|
| 5 |
+
],
|
| 6 |
+
"attn_pdrop": 0.1,
|
| 7 |
+
"bos_token_id": 50256,
|
| 8 |
+
"embd_pdrop": 0.1,
|
| 9 |
+
"eos_token_id": 50256,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"is_model_parallel": false,
|
| 12 |
+
"layer_norm_epsilon": 1e-05,
|
| 13 |
+
"model_type": "gpt2",
|
| 14 |
+
"n_ctx": 1024,
|
| 15 |
+
"n_embd": 768,
|
| 16 |
+
"n_head": 12,
|
| 17 |
+
"n_inner": null,
|
| 18 |
+
"n_layer": 12,
|
| 19 |
+
"n_positions": 1024,
|
| 20 |
+
"reorder_and_upcast_attn": false,
|
| 21 |
+
"resid_pdrop": 0.1,
|
| 22 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 23 |
+
"scale_attn_weights": true,
|
| 24 |
+
"summary_activation": null,
|
| 25 |
+
"summary_first_dropout": 0.1,
|
| 26 |
+
"summary_proj_to_labels": true,
|
| 27 |
+
"summary_type": "cls_index",
|
| 28 |
+
"summary_use_proj": true,
|
| 29 |
+
"task_specific_params": {
|
| 30 |
+
"text-generation": {
|
| 31 |
+
"do_sample": true,
|
| 32 |
+
"max_length": 50
|
| 33 |
+
}
|
| 34 |
+
},
|
| 35 |
+
"torch_dtype": "float16",
|
| 36 |
+
"transformers_version": "4.52.4",
|
| 37 |
+
"use_cache": true,
|
| 38 |
+
"vocab_size": 50257
|
| 39 |
+
}
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/generation_config.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"transformers_version": "4.52.4"
|
| 6 |
+
}
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6db8d7ef0c60a83997707157123d8d53192f30546f60b0fa6d61bc25892aacb1
|
| 3 |
+
size 256281854
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/special_tokens_map.json
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": "<|endoftext|>",
|
| 3 |
+
"eos_token": "<|endoftext|>",
|
| 4 |
+
"pad_token": "<|endoftext|>",
|
| 5 |
+
"unk_token": "<|endoftext|>"
|
| 6 |
+
}
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/tokenizer_config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"added_tokens_decoder": {
|
| 4 |
+
"50256": {
|
| 5 |
+
"content": "<|endoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false,
|
| 10 |
+
"special": true
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"bos_token": "<|endoftext|>",
|
| 14 |
+
"clean_up_tokenization_spaces": false,
|
| 15 |
+
"eos_token": "<|endoftext|>",
|
| 16 |
+
"extra_special_tokens": {},
|
| 17 |
+
"model_max_length": 1024,
|
| 18 |
+
"pad_token": "<|endoftext|>",
|
| 19 |
+
"tokenizer_class": "GPT2Tokenizer",
|
| 20 |
+
"unk_token": "<|endoftext|>"
|
| 21 |
+
}
|
distillm/results/gpt2/train/spandistill_0.1B_1.5B_on_v5/3570_w_2.0/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|