LH-Tech-AI committed on
Commit b136fe1 · verified · 1 Parent(s): a2a0b77

Upload 2 files

finetune-apex-1.5-to-apex-1.5-coder.py ADDED
@@ -0,0 +1,136 @@
+ import os
+ import time
+ import math
+ import torch
+ from model import GPTConfig, GPT
+
+ import numpy as np
+
+ # -----------------------------------------------------------------------------
+ out_dir = '/media/leo/Data/checkpoints/350m_Apex_1.5_Code'
+ init_from_file = '/media/leo/Data/checkpoints/350m_Apex_1.5_Final_NEW_More_Anti_Forgetting/Apex_1.5_Final.pt'
+ dataset = 'apex_code_boost'
+
+ # Gentle hyperparameters to guard against catastrophic forgetting
+ batch_size = 4
+ gradient_accumulation_steps = 32
+ block_size = 1024
+ learning_rate = 1e-5
+ max_iters = 1000
+ weight_decay = 0.1
+ dropout = 0.1
+ warmup_iters = 50
+ min_lr = 1e-6
+ beta1, beta2 = 0.9, 0.95
+ device = 'cuda'
+ dtype = 'bfloat16'
+ compile = True
+ save_interval = 500
+ # -----------------------------------------------------------------------------
+
+ os.makedirs(out_dir, exist_ok=True)
+ torch.manual_seed(1337)
+ device_type = 'cuda' if 'cuda' in device else 'cpu'
+ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+
+ # Data loader (binary token/mask files produced by the prepare script)
+ data_dir = os.path.join('data', dataset)
+ train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+ train_mask = np.memmap(os.path.join(data_dir, 'train_mask.bin'), dtype=np.uint8, mode='r')
+
+ def get_batch():
+     ix = torch.randint(len(train_data) - block_size, (batch_size,))
+     x = torch.stack([torch.from_numpy((train_data[i:i+block_size]).astype(np.int64)) for i in ix])
+     y = torch.stack([torch.from_numpy((train_data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+     # Load the mask (aligned with y, i.e. shifted by 1)
+     m = torch.stack([torch.from_numpy((train_mask[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+
+     # IMPORTANT: wherever m == 0, replace the target in y with -100
+     # PyTorch's CrossEntropyLoss ignores targets of -100 automatically
+     y[m == 0] = -100
+
+     x, y = x.to(device), y.to(device)
+     return x, y
+
+ # Load the model (base checkpoint for code finetuning)
+ print(f"📥 Loading Apex 1.5 Final as the base model...")
+ checkpoint = torch.load(init_from_file, map_location=device)
+ gptconf = GPTConfig(**checkpoint['model_args'])
+ model = GPT(gptconf)
+ state_dict = checkpoint['model']
+
+ # Strip potential '_orig_mod.' prefixes left behind by torch.compile
+ unwanted_prefix = '_orig_mod.'
+ for k, v in list(state_dict.items()):
+     if k.startswith(unwanted_prefix):
+         state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+
+ model.load_state_dict(state_dict)
+ model.to(device)
+
+ if compile:
+     print("🚀 Compiling model...")
+     model = torch.compile(model)
+
+ optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
+ scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
+
+ # LR scheduler: linear warmup, then cosine decay down to min_lr
+ def get_lr(it):
+     if it < warmup_iters: return learning_rate * it / warmup_iters
+     if it > max_iters: return min_lr
+     decay_ratio = (it - warmup_iters) / (max_iters - warmup_iters)
+     coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
+     return min_lr + coeff * (learning_rate - min_lr)
+
+ # Training loop
+ print(f"🛠️ Starting finetuning: teaching Apex 1.5 to code...")
+ model.train()
+ t0 = time.time()
+
+ for iter_num in range(max_iters + 1):
+     lr = get_lr(iter_num)
+     for param_group in optimizer.param_groups:
+         param_group['lr'] = lr
+
+     for micro_step in range(gradient_accumulation_steps):
+         X, Y = get_batch()
+         with ctx:
+             logits, loss = model(X, Y)
+             loss = loss / gradient_accumulation_steps
+         scaler.scale(loss).backward()
+
+     scaler.unscale_(optimizer)
+     torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+     scaler.step(optimizer)
+     scaler.update()
+     optimizer.zero_grad(set_to_none=True)
+
+     if iter_num % 10 == 0:
+         dt = time.time() - t0
+         print(f"Iter {iter_num}: Loss {loss.item()*gradient_accumulation_steps:.4f}, Time {dt*1000:.2f}ms, LR {lr:.2e}")
+         t0 = time.time()
+
+     if iter_num > 0 and iter_num % save_interval == 0:
+         checkpoint_name = f'Apex_1.5_Code_iter_{iter_num}.pt'
+         save_path = os.path.join(out_dir, checkpoint_name)
+         print(f"💾 Saving intermediate checkpoint: {checkpoint_name}")
+         raw_model = model._orig_mod if compile else model
+         checkpoint_data = {
+             'model': raw_model.state_dict(),
+             'model_args': checkpoint['model_args'],
+             'iter_num': iter_num,
+             'lr': lr,
+         }
+         torch.save(checkpoint_data, save_path)
+
+ # Final save
+ print(f"💾 Finetuning finished. Saving Apex 1.5 Code...")
+ final_checkpoint = {
+     'model': model.state_dict() if not compile else model._orig_mod.state_dict(),
+     'model_args': checkpoint['model_args'],
+     'config': checkpoint.get('config', {}),
+ }
+ torch.save(final_checkpoint, os.path.join(out_dir, 'Apex_1.5_Code_Final.pt'))
+ print("✅ Apex 1.5 Code saved successfully!")
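Side note: the target masking in get_batch() relies on PyTorch's cross-entropy using ignore_index=-100 by default, so prompt positions contribute nothing to the gradient. A minimal, self-contained sketch with toy tensors (not part of the commit) illustrating the effect:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
vocab_size, T = 11, 6
logits = torch.randn(1, T, vocab_size)            # stand-in for model output
targets = torch.randint(0, vocab_size, (1, T))    # stand-in for next-token targets
mask = torch.tensor([[0, 0, 0, 1, 1, 1]])         # 0 = prompt token, 1 = completion token

masked_targets = targets.clone()
masked_targets[mask == 0] = -100                  # same trick as in get_batch()

# cross_entropy defaults to ignore_index=-100, so prompt positions are skipped
loss_all = F.cross_entropy(logits.view(-1, vocab_size), targets.view(-1))
loss_completion_only = F.cross_entropy(logits.view(-1, vocab_size), masked_targets.view(-1))
print(loss_all.item(), loss_completion_only.item())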
prepare-apex-1.5-coder-data.py ADDED
@@ -0,0 +1,80 @@
+ import os
+ import numpy as np
+ import tiktoken
+ from datasets import load_dataset
+ from tqdm import tqdm
+ import random
+
+ # --- CONFIGURATION ---
+ OUTPUT_DIR = "data/apex_code_boost"  # new dataset name!
+ TOKENIZER_NAME = "gpt2"
+ SEED = 1337
+
+ # Gentle mix for the refresher training:
+ # we take less FineWeb so that code gets more weight
+ FINEWEB_SAMPLES = 50000
+ # Additionally load a code dataset (Python focus)
+ print("📥 Loading 'sahil2801/CodeAlpaca-20k'...")
+ code_dataset = load_dataset("sahil2801/CodeAlpaca-20k", split='train')
+
+ enc = tiktoken.get_encoding(TOKENIZER_NAME)
+ EOS_TOKEN = "<|endoftext|>"
+
+ def format_prompt_with_mask(instruction, input_text, output):
+     if input_text and input_text.strip():
+         prompt_text = f"Instruction:\n{instruction}\n\nInput:\n{input_text}\n\nResponse:\n"
+     else:
+         prompt_text = f"Instruction:\n{instruction}\n\nResponse:\n"
+     completion_text = f"{output}{EOS_TOKEN}"
+     prompt_ids = enc.encode(prompt_text, allowed_special={'<|endoftext|>'})
+     completion_ids = enc.encode(completion_text, allowed_special={'<|endoftext|>'})
+     full_ids = prompt_ids + completion_ids
+     mask = [0] * len(prompt_ids) + [1] * len(completion_ids)
+     return full_ids, mask
+
+ def main():
+     np.random.seed(SEED)
+     os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+     alpaca = load_dataset("yahma/alpaca-cleaned", split='train')
+     fineweb = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split='train', streaming=True)
+
+     all_samples = []
+
+     # 1. Process Alpaca
+     for ex in tqdm(alpaca, desc="Alpaca"):
+         all_samples.append(format_prompt_with_mask(ex['instruction'], ex['input'], ex['output']))
+
+     # 2. Process Code-Alpaca (IMPORTANT!)
+     for ex in tqdm(code_dataset, desc="Code-Alpaca"):
+         all_samples.append(format_prompt_with_mask(ex['instruction'], ex['input'], ex['output']))
+
+     # 3. Process FineWeb (knowledge retention)
+     fw_iter = iter(fineweb)
+     for _ in tqdm(range(FINEWEB_SAMPLES), desc="FineWeb"):
+         try:
+             ex = next(fw_iter)
+             text = ex['text'] + EOS_TOKEN
+             ids = enc.encode(text, allowed_special={EOS_TOKEN})
+             all_samples.append((ids, [1] * len(ids)))
+         except StopIteration:
+             break
+
+     # Shuffle to mitigate forgetting
+     random.seed(SEED)
+     random.shuffle(all_samples)
+
+     all_tokens = []
+     all_masks = []
+     for ids, mask in all_samples:
+         all_tokens.extend(ids)
+         all_masks.extend(mask)
+
+     # Save
+     print(f"💾 Saving to '{OUTPUT_DIR}'...")
+     np.array(all_tokens, dtype=np.uint16).tofile(os.path.join(OUTPUT_DIR, "train.bin"))
+     np.array(all_masks, dtype=np.uint8).tofile(os.path.join(OUTPUT_DIR, "train_mask.bin"))
+     print("✅ Code-boost dataset ready!")
+
+ if __name__ == "__main__":
+     main()
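After running the script, the resulting train.bin / train_mask.bin pair can be spot-checked with a short sketch like the one below (assuming the default OUTPUT_DIR above, not part of the commit). It decodes a window of tokens and reports how much of it is completion text that the finetuning loss will actually see:

import numpy as np
import tiktoken

enc = tiktoken.get_encoding("gpt2")
tokens = np.memmap("data/apex_code_boost/train.bin", dtype=np.uint16, mode="r")
mask = np.memmap("data/apex_code_boost/train_mask.bin", dtype=np.uint8, mode="r")
assert len(tokens) == len(mask), "token and mask streams must have equal length"

window = 200                                   # inspect the first 200 tokens
text = enc.decode(tokens[:window].astype(np.int64).tolist())
trained_fraction = mask[:window].mean()        # share of positions with mask == 1
print(f"trained-on fraction in window: {trained_fraction:.2f}")
print(text)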