narySt committed on
Commit
8964978
·
verified ·
1 Parent(s): 9d840b4

Delete wandb

Browse files
wandb/debug-internal.log DELETED
@@ -1,13 +0,0 @@
1
- {"time":"2026-04-18T12:19:16.854153102Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
- {"time":"2026-04-18T12:19:17.275622627Z","level":"INFO","msg":"stream: created new stream","id":"2mk39j3k"}
3
- {"time":"2026-04-18T12:19:17.275728468Z","level":"INFO","msg":"handler: started","stream_id":"2mk39j3k"}
4
- {"time":"2026-04-18T12:19:17.27585918Z","level":"INFO","msg":"stream: started","id":"2mk39j3k"}
5
- {"time":"2026-04-18T12:19:17.275907737Z","level":"INFO","msg":"writer: started","stream_id":"2mk39j3k"}
6
- {"time":"2026-04-18T12:19:17.275922617Z","level":"INFO","msg":"sender: started","stream_id":"2mk39j3k"}
7
- {"time":"2026-04-18T12:19:17.416506096Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
- {"time":"2026-04-18T16:22:36.413732384Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
- {"time":"2026-04-18T16:22:36.528364306Z","level":"INFO","msg":"handler: operation stats","stats":{}}
10
- {"time":"2026-04-18T16:22:36.53202978Z","level":"INFO","msg":"stream: closing","id":"2mk39j3k"}
11
- {"time":"2026-04-18T16:22:36.532057065Z","level":"INFO","msg":"handler: closed","stream_id":"2mk39j3k"}
12
- {"time":"2026-04-18T16:22:36.532311468Z","level":"INFO","msg":"sender: closed","stream_id":"2mk39j3k"}
13
- {"time":"2026-04-18T16:22:36.532328723Z","level":"INFO","msg":"stream: closed","id":"2mk39j3k"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/debug.log DELETED
@@ -1,24 +0,0 @@
1
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_setup.py:_flush():81] Configure stats pid to 4105
3
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/2026-04-18/12-19-14/wandb/run-20260418_121916-2mk39j3k/logs/debug.log
5
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/2026-04-18/12-19-14/wandb/run-20260418_121916-2mk39j3k/logs/debug-internal.log
6
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:init():844] calling init triggers
7
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
- config: {'model': {'name': 'EleutherAI/pythia-1.4b', 'checkpoint_path': None, 'from_scratch': False}, 'training': {'epochs': 3, 'batch_size': 4, 'eval_batch_size': 12, 'gradient_accumulation_steps': 4, 'lr': 2e-05, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 4, 'pin_memory': True}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 1000, 'save_every_epoch': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'code-completion_full', 'run_name': 'pythia_1_4b_v4_lr_2e-5', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/2026-04-18/12-19-14'}, 'paths': {'output_dir': 'outputs/2026-04-18/12-19-14'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_pythia/train.py'}}
9
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:init():892] starting backend
10
- 2026-04-18 12:19:16,824 INFO MainThread:4105 [wandb_init.py:init():895] sending inform_init request
11
- 2026-04-18 12:19:16,852 INFO MainThread:4105 [wandb_init.py:init():903] backend started and connected
12
- 2026-04-18 12:19:16,858 INFO MainThread:4105 [wandb_init.py:init():973] updated telemetry
13
- 2026-04-18 12:19:16,890 INFO MainThread:4105 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
- 2026-04-18 12:19:17,414 INFO MainThread:4105 [wandb_init.py:init():1044] starting run threads in backend
15
- 2026-04-18 12:19:17,567 INFO MainThread:4105 [wandb_run.py:_console_start():2529] atexit reg
16
- 2026-04-18 12:19:17,568 INFO MainThread:4105 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
- 2026-04-18 12:19:17,568 INFO MainThread:4105 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
- 2026-04-18 12:19:17,568 INFO MainThread:4105 [wandb_run.py:_redirect():2469] Redirects installed.
19
- 2026-04-18 12:19:17,571 INFO MainThread:4105 [wandb_init.py:init():1084] run started, returning control to user process
20
- 2026-04-18 16:22:34,834 INFO MainThread:4105 [wandb_run.py:_finish():2295] finishing run nikita/code-completion_full/2mk39j3k
21
- 2026-04-18 16:22:34,835 INFO MainThread:4105 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
- 2026-04-18 16:22:34,835 INFO MainThread:4105 [wandb_run.py:_restore():2476] restore
23
- 2026-04-18 16:22:34,835 INFO MainThread:4105 [wandb_run.py:_restore():2482] restore done
24
- 2026-04-18 16:22:36,531 INFO MainThread:4105 [wandb_run.py:_footer_sync_info():3870] logging synced files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20260418_121916-2mk39j3k/files/code/code_completion_exp/train_pythia/train.py DELETED
@@ -1,598 +0,0 @@
1
- """
2
- Training Pipeline для Pythia (decoder-only transformer) на задаче Code Completion.
3
-
4
- Конфигурация через Hydra + OmegaConf, логирование в Trackio.
5
- Поддержка DDP через Accelerate для multi-GPU тренировки.
6
-
7
- Использование:
8
- # Базовый запуск (single GPU)
9
- python train.py
10
-
11
- # Multi-GPU с Accelerate
12
- accelerate launch train.py
13
-
14
- # Multi-GPU с указанием количества GPU
15
- accelerate launch --num_processes=4 train.py
16
-
17
- # Переопределение параметров через CLI
18
- python train.py training.lr=1e-4 training.epochs=5
19
-
20
- # Выбор другого конфига модели
21
- python train.py model=pythia_160m
22
-
23
- # Multirun (sweep)
24
- python train.py --multirun training.lr=1e-4,3e-4,1e-3
25
-
26
- # Без логирования
27
- python train.py tracking.enabled=false
28
- """
29
-
30
- import os
31
- import math
32
- import time
33
- from pathlib import Path
34
-
35
- import torch
36
- import torch.nn as nn
37
- import torch.nn.functional as F
38
- from torch.utils.data import DataLoader
39
- from datasets import load_from_disk
40
-
41
- import hydra
42
- from hydra.core.hydra_config import HydraConfig
43
- from omegaconf import DictConfig, OmegaConf
44
- from transformers import (
45
- AutoTokenizer,
46
- AutoModelForCausalLM,
47
- AutoConfig,
48
- PreTrainedTokenizerBase,
49
- )
50
- from accelerate import Accelerator
51
- from accelerate.utils import set_seed as accelerate_set_seed
52
-
53
- # Ensure repo root is on sys.path (needed when running from subdirectory)
54
- import sys
55
- sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
56
-
57
- # Shared training library
58
- from training_lib.utils import AverageMeter, log_message
59
- from training_lib.checkpointing import save_checkpoint, load_checkpoint
60
- from training_lib.schedulers import get_lr_scheduler
61
- from training_lib.tracking import init_tracking, log_metrics, finish_tracking
62
- from training_lib.validation import run_validation
63
-
64
-
65
- # ============================================================================
66
- # ДАННЫЕ
67
- # ============================================================================
68
-
69
-
70
class CodeCompletionCollator:
    """Collate function that batches code-completion examples.

    Every example is a dict with a ``"context"`` and a ``"target"`` string.
    The two parts are tokenized separately, an EOS token is appended to the
    target, and the concatenated sequences are right-padded to the longest
    sequence in the batch.

    Args:
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and
            ``eos_token_id``.
        max_context_len: Token budget passed as ``max_length`` for contexts.
        max_target_len: Token budget passed as ``max_length`` for targets
            (the appended EOS is not counted against it).
    """

    def __init__(
        self,
        tokenizer: "PreTrainedTokenizerBase",
        max_context_len: int = 1024,
        max_target_len: int = 256,
    ):
        self.tokenizer = tokenizer
        self.max_context_len = max_context_len
        self.max_target_len = max_target_len

        # Some tokenizers ship without a pad token. Padding with ``None``
        # would make ``torch.tensor`` fail, so fall back to EOS; padded
        # positions are masked out by ``attention_mask`` anyway.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id
        self.pad_token_id = pad_token_id

    def __call__(self, batch: list[dict]) -> dict:
        """Tokenize, concatenate and pad a list of examples.

        Returns:
            A dict of ``torch.long`` tensors:
            ``input_ids`` and ``attention_mask`` of shape
            ``(len(batch), longest_sequence)`` and ``context_lengths`` of
            shape ``(len(batch),)`` holding the number of context tokens of
            each example.
        """
        contexts = [item["context"] for item in batch]
        targets = [item["target"] for item in batch]

        # NOTE(review): truncation uses the tokenizer's configured
        # truncation side. If that is "right", over-long contexts lose the
        # tokens closest to the completion point — confirm this is intended.
        encoded_contexts = self.tokenizer(
            contexts,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_context_len,
            return_tensors=None,
        )
        encoded_targets = self.tokenizer(
            targets,
            add_special_tokens=False,
            truncation=True,
            max_length=self.max_target_len,
            return_tensors=None,
        )

        eos_suffix = [self.tokenizer.eos_token_id]
        input_ids_list = []
        context_lengths = []
        for ctx_ids, tgt_ids in zip(
            encoded_contexts["input_ids"], encoded_targets["input_ids"]
        ):
            context_lengths.append(len(ctx_ids))
            # The target is always terminated with EOS so the model learns
            # where a completion ends.
            input_ids_list.append(ctx_ids + tgt_ids + eos_suffix)

        max_len = max(len(ids) for ids in input_ids_list)

        padded_input_ids = []
        attention_mask = []
        for ids in input_ids_list:
            padding_len = max_len - len(ids)
            padded_input_ids.append(ids + [self.pad_token_id] * padding_len)
            attention_mask.append([1] * len(ids) + [0] * padding_len)

        return {
            "input_ids": torch.tensor(padded_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "context_lengths": torch.tensor(context_lengths, dtype=torch.long),
        }
128
-
129
-
130
def create_dataloaders(
    cfg: DictConfig, tokenizer: PreTrainedTokenizerBase
) -> dict[str, DataLoader]:
    """Build DataLoaders for the "train" and "validation" splits found on disk.

    The dataset is loaded from ``cfg.data.path``. A split that is absent from
    the loaded dataset is simply omitted from the returned dict. Both loaders
    share one ``CodeCompletionCollator``; only the training loader shuffles.
    """
    splits = load_from_disk(cfg.data.path)

    collate = CodeCompletionCollator(
        tokenizer=tokenizer,
        max_context_len=cfg.data.max_context_len,
        max_target_len=cfg.data.max_target_len,
    )

    def _build(split_name, batch_size, shuffle):
        # Everything except split, batch size and shuffling is shared.
        return DataLoader(
            splits[split_name],
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate,
            num_workers=cfg.data.num_workers,
            pin_memory=cfg.data.pin_memory,
        )

    loaders = {}

    if "train" in splits:
        loaders["train"] = _build("train", cfg.training.batch_size, True)

    if "validation" in splits:
        # Evaluation may use its own batch size; default to the training one.
        val_batch_size = cfg.training.get(
            "eval_batch_size", cfg.training.batch_size
        )
        loaders["validation"] = _build("validation", val_batch_size, False)

    return loaders
166
-
167
-
168
-
169
-
170
- # ============================================================================
171
- # LOSS ФУНКЦИИ
172
- # ============================================================================
173
-
174
-
175
def compute_loss(
    logits: torch.Tensor,
    input_ids: torch.Tensor,
    context_lengths: torch.Tensor,
    attention_mask: torch.Tensor,
) -> dict:
    """Next-token cross-entropy restricted to target (non-context) tokens.

    Args:
        logits: Model outputs indexed as ``[batch, position, vocab]``.
        input_ids: Token ids indexed as ``[batch, position]``.
        context_lengths: One entry per batch row: the number of leading
            context tokens in that row.
        attention_mask: Same layout as ``input_ids``; zero marks padding.

    Returns:
        ``{"loss": scalar_tensor}``. The loss is the mean cross-entropy over
        all non-padded target tokens of the batch. If the batch contains no
        such token the loss is zero but stays attached to the autograd graph
        of ``logits`` so that ``backward()`` remains valid.
    """
    # Logits at position t predict the token at position t + 1.
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = input_ids[:, 1:].contiguous()
    shift_mask = attention_mask[:, 1:].contiguous().bool()

    # The first target token sits at index ``ctx_len`` and is therefore
    # predicted at shifted index ``ctx_len - 1``. Clamp at zero: an empty
    # context must select every shifted position, whereas a raw ``-1`` start
    # would act as a negative index and select only the last one.
    positions = torch.arange(shift_labels.size(1), device=shift_labels.device)
    first_target = (context_lengths.to(shift_labels.device) - 1).clamp(min=0)
    target_mask = positions.unsqueeze(0) >= first_target.unsqueeze(1)

    final_mask = target_mask & shift_mask

    if final_mask.any():
        loss = F.cross_entropy(
            shift_logits[final_mask], shift_labels[final_mask], reduction="mean"
        )
    else:
        # Keep the zero connected to the graph; a fresh tensor would have no
        # grad_fn and break the caller's backward pass.
        loss = shift_logits.sum() * 0.0

    return {"loss": loss}
203
-
204
-
205
def _pythia_forward_loss(
    model: nn.Module,
    batch: dict,
    cfg: DictConfig,
    accelerator: Accelerator,
) -> dict:
    """Run a plain HF causal LM on ``batch`` and return the ``compute_loss`` dict.

    The model is called with the ``attention_mask=`` keyword and its
    ``.logits`` attribute is read. ``cfg`` and ``accelerator`` are accepted
    but not used inside this function.
    """
    ids, mask, ctx_lens = (
        batch[key] for key in ("input_ids", "attention_mask", "context_lengths")
    )
    logits = model(ids, attention_mask=mask).logits
    return compute_loss(logits, ids, ctx_lens, mask)
217
-
218
-
219
- # ============================================================================
220
- # PARAMETER GROUPING
221
- # ============================================================================
222
-
223
-
224
def group_params(model: nn.Module, weight_decay: float) -> list[dict]:
    """Split trainable parameters into weight-decay and no-decay groups.

    Biases, normalization parameters and any other tensor with fewer than two
    dimensions are exempt from weight decay. Normalization layers are
    recognised regardless of spelling (``LayerNorm``, ``layernorm``,
    ``layer_norm``), so modules such as ``final_layer_norm`` are no longer
    decayed by accident.

    Args:
        model: Module whose ``named_parameters()`` are grouped. Parameters
            with ``requires_grad=False`` are skipped.
        weight_decay: Decay coefficient for the first group.

    Returns:
        Two optimizer param-group dicts: the decayed group first, then the
        group with ``weight_decay`` fixed to ``0.0``.
    """
    decay_params = []
    no_decay_params = []

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue

        # Normalise the name so every common norm-layer spelling matches.
        normalized_name = name.lower().replace("_", "")
        exempt = (
            param.ndim < 2  # biases, norm scales/shifts, scalars
            or "bias" in name
            or "layernorm" in normalized_name
        )
        (no_decay_params if exempt else decay_params).append(param)

    return [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
242
-
243
-
244
-
245
-
246
- # ============================================================================
247
- # TRAINING LOOP
248
- # ============================================================================
249
-
250
-
251
def train_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: torch.optim.Optimizer,
    scheduler,
    cfg: DictConfig,
    epoch: int,
    global_step: int,
    accelerator: Accelerator,
    val_dataloader: DataLoader | None = None,
    best_val_loss: float = float("inf"),
) -> tuple[int, float]:
    """Run one training epoch with manual gradient accumulation.

    An optimizer step is taken every
    ``cfg.training.gradient_accumulation_steps`` micro-batches. After each
    optimizer step the function may log metrics, save a checkpoint and run
    validation, depending on the intervals in ``cfg.logging``. When
    validation improves on ``best_val_loss`` the main process writes
    ``model_best.pt`` into ``cfg.paths.output_dir``.

    Args:
        model: Model to train; called through ``_pythia_forward_loss``.
        dataloader: Training batches.
        optimizer: Optimizer stepped once per accumulation window.
        scheduler: LR scheduler stepped together with the optimizer.
        cfg: Run configuration (``training``, ``logging``, ``paths``).
        epoch: Epoch number, used for logging and checkpoint metadata.
        global_step: Optimizer steps completed before this epoch.
        accelerator: Accelerate handle (autocast, backward, clipping).
        val_dataloader: Optional validation batches.
        best_val_loss: Best validation loss observed so far.

    Returns:
        ``(global_step, best_val_loss)`` after this epoch.
    """
    model.train()

    # Loop-invariant configuration, read once.
    grad_accum_steps = cfg.training.gradient_accumulation_steps
    max_grad_norm = cfg.training.max_grad_norm
    log_interval = cfg.logging.log_interval
    save_interval = cfg.logging.save_interval
    eval_interval = cfg.logging.get("eval_interval", 0)

    loss_meter = AverageMeter()

    optimizer.zero_grad()
    accumulated_loss = 0.0
    accumulated_steps = 0

    epoch_start_time = time.time()
    step_start_time = time.time()

    for batch in dataloader:
        # Same forward + loss path that validation uses, so the two cannot
        # drift apart.
        with accelerator.autocast():
            loss_dict = _pythia_forward_loss(model, batch, cfg, accelerator)

        # Scale so the accumulated gradient is the mean over the window.
        loss = loss_dict["loss"] / grad_accum_steps
        accelerator.backward(loss)

        accumulated_loss += loss_dict["loss"].item()
        accumulated_steps += 1

        if accumulated_steps != grad_accum_steps:
            continue

        # ---- one full accumulation window collected: optimizer step ----
        if max_grad_norm > 0:
            accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        avg_loss = accumulated_loss / grad_accum_steps
        loss_meter.update(avg_loss)

        global_step += 1

        if global_step % log_interval == 0:
            step_time = time.time() - step_start_time
            current_lr = scheduler.get_last_lr()[0]

            metrics = {
                "train/loss": loss_meter.val,
                "train/loss_avg": loss_meter.avg,
                "train/lr": current_lr,
                "train/epoch": epoch,
                "train/step_time": step_time / log_interval,
            }

            log_metrics(metrics, step=global_step)

            log_message(
                f"Epoch {epoch} | Step {global_step} | "
                f"Loss: {loss_meter.avg:.4f} | "
                f"LR: {current_lr:.2e}",
                cfg,
                accelerator,
            )

            step_start_time = time.time()

        if save_interval > 0 and global_step % save_interval == 0:
            save_checkpoint(
                model, optimizer, scheduler, global_step, epoch, cfg, accelerator
            )

        if (
            eval_interval > 0
            and val_dataloader is not None
            and global_step % eval_interval == 0
        ):
            val_metrics = run_validation(
                model=model,
                dataloader=val_dataloader,
                cfg=cfg,
                global_step=global_step,
                accelerator=accelerator,
                forward_loss_fn=_pythia_forward_loss,
            )

            if val_metrics["val/loss"] < best_val_loss:
                best_val_loss = val_metrics["val/loss"]
                if accelerator.is_main_process:
                    best_model_path = Path(cfg.paths.output_dir) / "model_best.pt"
                    unwrapped_model = accelerator.unwrap_model(model)
                    torch.save(unwrapped_model.state_dict(), best_model_path)
                    log_message(
                        f"New best model saved! Val loss: {best_val_loss:.4f}",
                        cfg,
                        accelerator,
                    )

                log_metrics(
                    {
                        "best/val_loss": best_val_loss,
                        "best/val_perplexity": val_metrics["val/perplexity"],
                        "best/step": global_step,
                    },
                    step=global_step,
                )

            # Switch back to training mode after validation.
            model.train()

        accumulated_loss = 0.0
        accumulated_steps = 0

    # NOTE(review): micro-batches left over when the number of batches is not
    # a multiple of the accumulation window never trigger an optimizer step;
    # their gradients are cleared by the zero_grad() at the start of the next
    # call. Kept as-is because the scheduler length is derived from this
    # behaviour elsewhere — confirm before changing.

    epoch_time = time.time() - epoch_start_time

    log_message(
        f"Epoch {epoch} completed in {epoch_time:.2f}s | "
        f"Loss: {loss_meter.avg:.4f}",
        cfg,
        accelerator,
    )

    # Pin the step like every other log_metrics call in this function so the
    # epoch summary lands on the same axis as the step metrics.
    log_metrics(
        {
            "epoch/loss": loss_meter.avg,
            "epoch/time": epoch_time,
        },
        step=global_step,
    )

    return global_step, best_val_loss
396
-
397
-
398
- # ============================================================================
399
- # MAIN
400
- # ============================================================================
401
-
402
-
403
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg: DictConfig):
    """Training entry point with multi-process support through Accelerate.

    Steps, in order: build the ``Accelerator``, seed, resolve the config,
    initialise tracking, load tokenizer and model, build dataloaders,
    optimizer and LR scheduler, optionally resume from a checkpoint, run the
    epoch loop, and finally save ``model_final.pt`` on the main process.
    """

    # === Performance: Enable TF32 for faster matmuls on Ampere+ GPUs ===
    torch.set_float32_matmul_precision('high')

    # === Accelerator Setup ===
    mixed_precision = "bf16" if cfg.training.use_amp else "no"

    accelerator = Accelerator(
        mixed_precision=mixed_precision,
        gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
        # The training loop calls scheduler.step() exactly once per optimizer
        # step and `total_steps` below is already a per-process count. With
        # Accelerate's default, a prepared scheduler is advanced once per
        # process on every call, which would exhaust the LR schedule
        # `num_processes` times too early on multi-GPU runs.
        step_scheduler_with_optimizer=False,
    )

    # === Setup ===
    accelerate_set_seed(cfg.seed)

    if cfg.paths.output_dir is None:
        cfg.paths.output_dir = HydraConfig.get().runtime.output_dir

    OmegaConf.resolve(cfg)

    log_message(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}", cfg, accelerator)
    log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
    log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
    log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)

    log_message("=" * 60, cfg, accelerator)
    log_message("Pythia Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator)
    log_message("=" * 60, cfg, accelerator)
    log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)

    # === Tracking Init ===
    init_tracking(cfg, accelerator)

    # === Tokenizer ===
    log_message("Initializing tokenizer...", cfg, accelerator)
    tokenizer = AutoTokenizer.from_pretrained(cfg.model.name)

    # Reuse EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # === Model ===
    log_message("Loading model...", cfg, accelerator)

    # Flash Attention 2
    torch_dtype = torch.bfloat16 if cfg.training.use_amp else torch.float32

    if cfg.model.checkpoint_path:
        # Pretrained architecture/weights first, then overwrite with the
        # checkpoint's state dict.
        model = AutoModelForCausalLM.from_pretrained(
            cfg.model.name,
            attn_implementation="flash_attention_2",
            torch_dtype=torch_dtype,
        )
        # NOTE(security): torch.load unpickles the file. Only point
        # `model.checkpoint_path` at checkpoints you trust.
        checkpoint = torch.load(cfg.model.checkpoint_path, map_location="cpu")
        # Accept both a full training checkpoint and a bare state dict.
        model.load_state_dict(checkpoint["model_state_dict"] if "model_state_dict" in checkpoint else checkpoint)
        log_message(f"Loaded checkpoint: {cfg.model.checkpoint_path}", cfg, accelerator)
    elif cfg.model.from_scratch:
        config = AutoConfig.from_pretrained(cfg.model.name)
        config._attn_implementation = "flash_attention_2"
        model = AutoModelForCausalLM.from_config(config, torch_dtype=torch_dtype)
        log_message(f"Initialized from scratch: {cfg.model.name}", cfg, accelerator)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            cfg.model.name,
            attn_implementation="flash_attention_2",
            torch_dtype=torch_dtype,
        )
        log_message(f"Loaded pretrained: {cfg.model.name}", cfg, accelerator)

    model.train()

    # Log model info
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    log_message(f"Total params: {total_params:,}", cfg, accelerator)
    log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)

    # === Data ===
    log_message("Creating dataloaders...", cfg, accelerator)
    dataloaders = create_dataloaders(cfg, tokenizer)

    train_dataloader = dataloaders["train"]
    val_dataloader = dataloaders.get("validation", None)

    log_message(f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator)
    log_message(f"Train batches per epoch (before DDP split): {len(train_dataloader)}", cfg, accelerator)

    if val_dataloader:
        log_message(f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator)
        log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
    else:
        log_message("No validation dataset found", cfg, accelerator)

    # === Optimizer ===
    log_message("Creating optimizer...", cfg, accelerator)
    param_groups = group_params(model, cfg.training.weight_decay)

    optimizer = torch.optim.AdamW(
        param_groups,
        lr=cfg.training.lr,
        betas=tuple(cfg.training.betas),
        eps=cfg.training.eps,
    )

    # === Scheduler ===
    # Per-process micro-batches per epoch; the dataloader has not been
    # sharded by Accelerate yet, hence the explicit division.
    steps_per_epoch = math.ceil(
        len(train_dataloader) / accelerator.num_processes
    )
    # Optimizer steps over the whole run (one per accumulation window).
    total_steps = (
        cfg.training.epochs
        * steps_per_epoch
        // cfg.training.gradient_accumulation_steps
    )
    scheduler = get_lr_scheduler(optimizer, cfg, total_steps)

    log_message(
        f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
        cfg,
        accelerator,
    )

    # === Accelerate Prepare ===
    log_message("Preparing model, optimizer, and dataloaders with Accelerate...", cfg, accelerator)

    if val_dataloader is not None:
        model, optimizer, train_dataloader, val_dataloader, scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, val_dataloader, scheduler
        )
    else:
        model, optimizer, train_dataloader, scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, scheduler
        )

    log_message(f"Train batches per epoch (after DDP split): {len(train_dataloader)}", cfg, accelerator)

    # === Resume ===
    global_step = 0
    start_epoch = 1

    if cfg.training.resume and cfg.training.resume_checkpoint:
        global_step, start_epoch = load_checkpoint(
            model, optimizer, scheduler, cfg.training.resume_checkpoint, cfg, accelerator
        )
        # Continue with the epoch after the one stored in the checkpoint.
        start_epoch += 1

    # === Training Loop ===
    log_message("Starting training...", cfg, accelerator)

    best_val_loss = float("inf")

    # Bound up front so the interrupt handler below can always reference it.
    epoch = start_epoch

    try:
        for epoch in range(start_epoch, cfg.training.epochs + 1):
            log_message(f"\n{'=' * 60}", cfg, accelerator)
            log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
            log_message(f"{'=' * 60}", cfg, accelerator)

            global_step, best_val_loss = train_epoch(
                model=model,
                dataloader=train_dataloader,
                optimizer=optimizer,
                scheduler=scheduler,
                cfg=cfg,
                epoch=epoch,
                global_step=global_step,
                accelerator=accelerator,
                val_dataloader=val_dataloader,
                best_val_loss=best_val_loss,
            )

            if cfg.logging.save_every_epoch:
                save_checkpoint(
                    model, optimizer, scheduler, global_step, epoch, cfg, accelerator
                )

    except KeyboardInterrupt:
        # Persist progress, then fall through to the final save below.
        log_message("Training interrupted by user", cfg, accelerator)
        save_checkpoint(model, optimizer, scheduler, global_step, epoch, cfg, accelerator)

    # === Final Save ===
    log_message("\nTraining completed!", cfg, accelerator)

    if accelerator.is_main_process:
        final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
        unwrapped_model = accelerator.unwrap_model(model)
        torch.save(unwrapped_model.state_dict(), final_model_path)
        log_message(f"Final model: {final_model_path}", cfg, accelerator)

    accelerator.wait_for_everyone()
    finish_tracking()
595
-
596
-
597
# Script entry point: run the Hydra-decorated main() only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20260418_121916-2mk39j3k/files/config.yaml DELETED
@@ -1,126 +0,0 @@
1
- _wandb:
2
- value:
3
- cli_version: 0.24.0
4
- code_path: code/code_completion_exp/train_pythia/train.py
5
- e:
6
- lxvl8uvlqbraeb0uteef4wc3ipy2fg2z:
7
- codePath: code_completion_exp/train_pythia/train.py
8
- codePathLocal: train.py
9
- cpu_count: 112
10
- cpu_count_logical: 224
11
- cudaVersion: "12.9"
12
- disk:
13
- /:
14
- total: "244813135872"
15
- used: "43314763776"
16
- email: nikita@local.ru
17
- executable: /venv/bytellm/bin/python
18
- git:
19
- commit: ff609fdb5d8f684fdbf9ea6d64d9440c17614af5
20
- remote: https://github.com/naryst/byte-llms-code.git
21
- gpu: NVIDIA H100 80GB HBM3
22
- gpu_count: 2
23
- gpu_nvidia:
24
- - architecture: Hopper
25
- cudaCores: 16896
26
- memoryTotal: "85520809984"
27
- name: NVIDIA H100 80GB HBM3
28
- uuid: GPU-3c87d2f8-c595-49bd-bb1d-1ebfd19c6fb0
29
- - architecture: Hopper
30
- cudaCores: 16896
31
- memoryTotal: "85520809984"
32
- name: NVIDIA H100 80GB HBM3
33
- uuid: GPU-beb9a6b0-ebef-1f4c-d886-465c96f57ca4
34
- host: 3e675e030992
35
- memory:
36
- total: "1622968434688"
37
- os: Linux-5.15.0-173-generic-x86_64-with-glibc2.39
38
- program: /workspace/byte-llms-code/code_completion_exp/train_pythia/train.py
39
- python: CPython 3.12.0
40
- root: outputs/2026-04-18/12-19-14
41
- startedAt: "2026-04-18T12:19:16.549853Z"
42
- writerId: lxvl8uvlqbraeb0uteef4wc3ipy2fg2z
43
- m: []
44
- python_version: 3.12.0
45
- t:
46
- "1":
47
- - 1
48
- - 11
49
- - 49
50
- - 50
51
- - 51
52
- - 71
53
- - 105
54
- "2":
55
- - 1
56
- - 11
57
- - 49
58
- - 50
59
- - 51
60
- - 71
61
- - 105
62
- "3":
63
- - 2
64
- - 13
65
- - 16
66
- - 61
67
- "4": 3.12.0
68
- "5": 0.24.0
69
- "6": 4.57.6
70
- "12": 0.24.0
71
- "13": linux-x86_64
72
- data:
73
- value:
74
- max_context_len: 4096
75
- max_target_len: 256
76
- num_workers: 4
77
- path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
78
- pin_memory: true
79
- device:
80
- value: cuda
81
- logging:
82
- value:
83
- eval_interval: 1000
84
- log_interval: 10
85
- save_every_epoch: true
86
- save_interval: 3000
87
- model:
88
- value:
89
- checkpoint_path: null
90
- from_scratch: false
91
- name: EleutherAI/pythia-1.4b
92
- paths:
93
- value:
94
- output_dir: outputs/2026-04-18/12-19-14
95
- seed:
96
- value: 42
97
- tracking:
98
- value:
99
- backend: wandb
100
- base_url: https://wandb.platun0v.ru
101
- enabled: true
102
- entity: null
103
- local_dir: outputs/2026-04-18/12-19-14
104
- project: code-completion_full
105
- run_name: pythia_1_4b_v4_lr_2e-5
106
- training:
107
- value:
108
- batch_size: 4
109
- betas:
110
- - 0.9
111
- - 0.95
112
- decay_ratio: 0.2
113
- epochs: 3
114
- eps: 1e-08
115
- eval_batch_size: 12
116
- gradient_accumulation_steps: 4
117
- lr: 2e-05
118
- lr_scheduler: wsd
119
- max_grad_norm: 1
120
- min_lr_ratio: 0.1
121
- resume: false
122
- resume_checkpoint: null
123
- use_amp: true
124
- warmup_ratio: 0.1
125
- warmup_steps: 100
126
- weight_decay: 0.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20260418_121916-2mk39j3k/files/output.log DELETED
The diff for this file is too large to render. See raw diff
 
wandb/run-20260418_121916-2mk39j3k/files/requirements.txt DELETED
@@ -1,246 +0,0 @@
1
- setuptools==78.1.1
2
- wheel==0.45.1
3
- pip==25.2
4
- webencodings==0.5.1
5
- triton==3.2.0
6
- pytz==2025.2
7
- pydub==0.25.1
8
- pure_eval==0.2.3
9
- ptyprocess==0.7.0
10
- nvidia-ml-py==13.590.48
11
- nvidia-cusparselt-cu12==0.6.2
12
- mpmath==1.3.0
13
- ipython-genutils==0.2.0
14
- fastjsonschema==2.21.2
15
- brotli==1.2.0
16
- antlr4-python3-runtime==4.9.3
17
- xxhash==3.6.0
18
- widgetsnbextension==4.0.14
19
- websocket-client==1.9.0
20
- webcolors==24.11.1
21
- wcwidth==0.2.14
22
- urllib3==2.5.0
23
- uri-template==1.3.0
24
- tzdata==2025.2
25
- typing_extensions==4.15.0
26
- types-python-dateutil==2.9.0.20251008
27
- traitlets==5.14.3
28
- tqdm==4.67.1
29
- tornado==6.5.2
30
- tomlkit==0.13.3
31
- tinycss2==1.4.0
32
- tabulate==0.9.0
33
- sympy==1.13.1
34
- soupsieve==2.8
35
- sniffio==1.3.1
36
- smmap==5.0.2
37
- six==1.17.0
38
- shellingham==1.5.4
39
- Send2Trash==1.8.3
40
- semantic-version==2.10.0
41
- safetensors==0.6.2
42
- rpds-py==0.27.1
43
- rfc3986-validator==0.1.1
44
- regex==2025.9.18
45
- pyzmq==27.1.0
46
- PyYAML==6.0.3
47
- python-multipart==0.0.22
48
- python-json-logger==4.0.0
49
- python-dotenv==1.2.1
50
- pyparsing==3.2.5
51
- PyJWT==2.8.0
52
- Pygments==2.19.2
53
- pycparser==2.23
54
- pyarrow==22.0.0
55
- psutil==7.1.0
56
- protobuf==6.33.4
57
- propcache==0.4.1
58
- prometheus_client==0.23.1
59
- portalocker==3.2.0
60
- platformdirs==4.5.0
61
- pillow==11.3.0
62
- pexpect==4.9.0
63
- pathspec==1.0.4
64
- parso==0.8.5
65
- pandocfilters==1.5.1
66
- packaging==25.0
67
- orjson==3.11.6
68
- opt_einsum==3.4.0
69
- nvidia-nvtx-cu12==12.4.127
70
- nvidia-nvjitlink-cu12==12.4.127
71
- nvidia-nccl-cu12==2.21.5
72
- nvidia-curand-cu12==10.3.5.147
73
- nvidia-cufile-cu12==1.13.1.3
74
- nvidia-cufft-cu12==11.2.1.3
75
- nvidia-cuda-runtime-cu12==12.4.127
76
- nvidia-cuda-nvrtc-cu12==12.4.127
77
- nvidia-cuda-cupti-cu12==12.4.127
78
- nvidia-cublas-cu12==12.4.5.8
79
- numpy==2.3.3
80
- ninja==1.13.0
81
- networkx==3.5
82
- nest-asyncio==1.6.0
83
- narwhals==2.15.0
84
- mypy_extensions==1.1.0
85
- multidict==6.7.0
86
- mistune==3.1.4
87
- mdurl==0.1.2
88
- MarkupSafe==3.0.3
89
- lxml==6.0.2
90
- librt==0.8.0
91
- lark==1.3.0
92
- kiwisolver==1.4.9
93
- jupyterlab_widgets==3.0.15
94
- jupyterlab_pygments==0.3.0
95
- jsonpointer==3.0.0
96
- json5==0.12.1
97
- itsdangerous==2.2.0
98
- idna==3.10
99
- hf-xet==1.1.10
100
- h11==0.16.0
101
- groovy==0.1.2
102
- fsspec==2025.9.0
103
- frozenlist==1.8.0
104
- fqdn==1.5.1
105
- fonttools==4.60.1
106
- filelock==3.19.1
107
- ffmpy==1.0.0
108
- executing==2.2.1
109
- einops==0.8.1
110
- dill==0.4.0
111
- defusedxml==0.7.1
112
- decorator==5.2.1
113
- debugpy==1.8.17
114
- dacite==1.9.2
115
- cycler==0.12.1
116
- comm==0.2.3
117
- colorama==0.4.6
118
- click==8.3.1
119
- charset-normalizer==3.4.3
120
- certifi==2025.10.5
121
- bleach==6.2.0
122
- babel==2.17.0
123
- attrs==25.4.0
124
- async-lru==2.0.5
125
- asttokens==3.0.0
126
- annotated-types==0.7.0
127
- annotated-doc==0.0.4
128
- aiohappyeyeballs==2.6.1
129
- aiofiles==24.1.0
130
- yarl==1.22.0
131
- uvicorn==0.40.0
132
- typing-inspection==0.4.2
133
- terminado==0.18.1
134
- stack-data==0.6.3
135
- sentry-sdk==2.50.0
136
- scipy==1.17.0
137
- sacrebleu==2.6.0
138
- rfc3987-syntax==1.1.0
139
- rfc3339-validator==0.1.4
140
- requests==2.32.5
141
- reportlab==4.4.9
142
- referencing==0.36.2
143
- python-dateutil==2.9.0.post0
144
- pydantic_core==2.41.5
145
- prompt_toolkit==3.0.52
146
- plotly==6.5.2
147
- pathlib2==2.3.7.post1
148
- orderedmultidict==1.0.2
149
- optree==0.17.0
150
- omegaconf==2.3.0
151
- nvidia-cusparse-cu12==12.3.1.170
152
- nvidia-cudnn-cu12==9.1.0.70
153
- mypy==1.19.1
154
- multiprocess==0.70.16
155
- matplotlib-inline==0.1.7
156
- markdown-it-py==4.0.0
157
- jupyter_core==5.8.1
158
- Jinja2==3.1.6
159
- jedi==0.19.2
160
- ipython_pygments_lexers==1.1.1
161
- httpcore==1.0.9
162
- gitdb==4.0.12
163
- ftfy==6.3.1
164
- contourpy==1.3.3
165
- cffi==2.0.0
166
- beautifulsoup4==4.14.2
167
- anyio==4.11.0
168
- aiosignal==1.4.0
169
- starlette==0.50.0
170
- rich==14.2.0
171
- pydantic==2.12.5
172
- pandas==2.3.3
173
- nvidia-cusolver-cu12==11.6.1.9
174
- matplotlib==3.10.7
175
- jupyter_server_terminals==0.5.3
176
- jupyter_client==8.6.3
177
- jsonschema-specifications==2025.9.1
178
- ipython==9.6.0
179
- hydra-core==1.3.2
180
- huggingface-hub==0.35.3
181
- httpx==0.28.1
182
- GitPython==3.1.46
183
- furl==2.1.4
184
- cryptography==46.0.4
185
- arrow==1.3.0
186
- argon2-cffi-bindings==25.1.0
187
- aiohttp==3.13.1
188
- wandb==0.24.0
189
- typer==0.21.1
190
- torch==2.6.0
191
- tokenizers==0.22.1
192
- seaborn==0.13.2
193
- safehttpx==0.1.7
194
- jsonschema==4.25.1
195
- joypy==0.2.6
196
- isoduration==20.11.0
197
- ipywidgets==8.1.7
198
- ipykernel==6.30.1
199
- gradio_client==2.0.3
200
- fastapi==0.128.0
201
- Authlib==1.6.6
202
- argon2-cffi==25.1.0
203
- transformers==4.57.6
204
- nbformat==5.10.4
205
- mlstm_kernels==2.0.2
206
- jupyter-console==6.6.3
207
- gradio==6.5.1
208
- datasets==4.3.0
209
- clearml==1.16.4
210
- accelerate==1.10.1
211
- xlstm==2.0.4
212
- nbclient==0.10.2
213
- jupyter-events==0.12.0
214
- trackio==0.15.0
215
- nbconvert==7.16.6
216
- jupyter_server==2.17.0
217
- notebook_shim==0.2.4
218
- jupyterlab_server==2.27.3
219
- jupyter-lsp==2.3.0
220
- nbclassic==1.3.3
221
- jupyterlab==4.4.9
222
- notebook==7.4.7
223
- jupyter_contrib_core==0.4.2
224
- jupyter==1.1.1
225
- jupyter_nbextensions_configurator==0.6.4
226
- causal-conv1d==1.5.0.post8
227
- flash_attn==2.7.4.post1
228
- mamba-ssm==2.2.4
229
- hnet==0.0.1
230
- speedtest-cli==2.1.3
231
- autocommand==2.2.2
232
- backports.tarfile==1.2.0
233
- importlib_metadata==8.0.0
234
- inflect==7.3.1
235
- jaraco.collections==5.1.0
236
- jaraco.context==5.3.0
237
- jaraco.functools==4.0.1
238
- jaraco.text==3.12.1
239
- more-itertools==10.3.0
240
- packaging==24.2
241
- platformdirs==4.2.2
242
- tomli==2.0.1
243
- typeguard==4.3.0
244
- typing_extensions==4.12.2
245
- wheel==0.45.1
246
- zipp==3.19.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20260418_121916-2mk39j3k/files/wandb-metadata.json DELETED
@@ -1,47 +0,0 @@
1
- {
2
- "os": "Linux-5.15.0-173-generic-x86_64-with-glibc2.39",
3
- "python": "CPython 3.12.0",
4
- "startedAt": "2026-04-18T12:19:16.549853Z",
5
- "program": "/workspace/byte-llms-code/code_completion_exp/train_pythia/train.py",
6
- "codePath": "code_completion_exp/train_pythia/train.py",
7
- "codePathLocal": "train.py",
8
- "git": {
9
- "remote": "https://github.com/naryst/byte-llms-code.git",
10
- "commit": "ff609fdb5d8f684fdbf9ea6d64d9440c17614af5"
11
- },
12
- "email": "nikita@local.ru",
13
- "root": "outputs/2026-04-18/12-19-14",
14
- "host": "3e675e030992",
15
- "executable": "/venv/bytellm/bin/python",
16
- "cpu_count": 112,
17
- "cpu_count_logical": 224,
18
- "gpu": "NVIDIA H100 80GB HBM3",
19
- "gpu_count": 2,
20
- "disk": {
21
- "/": {
22
- "total": "244813135872",
23
- "used": "43314763776"
24
- }
25
- },
26
- "memory": {
27
- "total": "1622968434688"
28
- },
29
- "gpu_nvidia": [
30
- {
31
- "name": "NVIDIA H100 80GB HBM3",
32
- "memoryTotal": "85520809984",
33
- "cudaCores": 16896,
34
- "architecture": "Hopper",
35
- "uuid": "GPU-3c87d2f8-c595-49bd-bb1d-1ebfd19c6fb0"
36
- },
37
- {
38
- "name": "NVIDIA H100 80GB HBM3",
39
- "memoryTotal": "85520809984",
40
- "cudaCores": 16896,
41
- "architecture": "Hopper",
42
- "uuid": "GPU-beb9a6b0-ebef-1f4c-d886-465c96f57ca4"
43
- }
44
- ],
45
- "cudaVersion": "12.9",
46
- "writerId": "lxvl8uvlqbraeb0uteef4wc3ipy2fg2z"
47
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20260418_121916-2mk39j3k/files/wandb-summary.json DELETED
@@ -1 +0,0 @@
1
- {"train/lr":2.0000000000000003e-06,"train/loss":0.6684185862541199,"val/time":125.04461622238159,"_runtime":14595,"_timestamp":1.7765293278895276e+09,"_step":29660,"epoch/loss":0.7010961592882488,"val/loss":1.0707972391764677,"_wandb":{"runtime":14595},"epoch/time":4867.094700336456,"best/val_perplexity":2.88394085342274,"best/step":19000,"train/loss_avg":0.7010793855010286,"train/step_time":0.357418155670166,"val/perplexity":2.9409693343660885,"best/val_loss":1.0516192755105014,"train/epoch":3}
 
 
wandb/run-20260418_121916-2mk39j3k/logs/debug-core.log DELETED
@@ -1,16 +0,0 @@
1
- {"time":"2026-04-18T12:19:16.63631091Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp8o4e0c4r/port-4105.txt","pid":4105,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
- {"time":"2026-04-18T12:19:16.636897052Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":4105}
3
- {"time":"2026-04-18T12:19:16.636890378Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-4105-4125-1730369648/socket","Net":"unix"}}
4
- {"time":"2026-04-18T12:19:16.824047472Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
- {"time":"2026-04-18T12:19:16.853988313Z","level":"INFO","msg":"handleInformInit: received","streamId":"2mk39j3k","id":"1(@)"}
6
- {"time":"2026-04-18T12:19:17.275867326Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"2mk39j3k","id":"1(@)"}
7
- {"time":"2026-04-18T16:22:36.531964538Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"2mk39j3k","id":"1(@)"}
8
- {"time":"2026-04-18T16:22:36.532983895Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"2mk39j3k","id":"1(@)"}
9
- {"time":"2026-04-18T16:22:36.553203529Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
- {"time":"2026-04-18T16:22:36.553260163Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
- {"time":"2026-04-18T16:22:36.553277979Z","level":"INFO","msg":"server is shutting down"}
12
- {"time":"2026-04-18T16:22:36.553320826Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
- {"time":"2026-04-18T16:22:36.553421428Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
- {"time":"2026-04-18T16:22:36.553434515Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
- {"time":"2026-04-18T16:22:36.553408703Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-4105-4125-1730369648/socket","Net":"unix"}}
16
- {"time":"2026-04-18T16:22:36.553462952Z","level":"INFO","msg":"server is closed"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20260418_121916-2mk39j3k/logs/debug-internal.log DELETED
@@ -1,13 +0,0 @@
1
- {"time":"2026-04-18T12:19:16.854153102Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
- {"time":"2026-04-18T12:19:17.275622627Z","level":"INFO","msg":"stream: created new stream","id":"2mk39j3k"}
3
- {"time":"2026-04-18T12:19:17.275728468Z","level":"INFO","msg":"handler: started","stream_id":"2mk39j3k"}
4
- {"time":"2026-04-18T12:19:17.27585918Z","level":"INFO","msg":"stream: started","id":"2mk39j3k"}
5
- {"time":"2026-04-18T12:19:17.275907737Z","level":"INFO","msg":"writer: started","stream_id":"2mk39j3k"}
6
- {"time":"2026-04-18T12:19:17.275922617Z","level":"INFO","msg":"sender: started","stream_id":"2mk39j3k"}
7
- {"time":"2026-04-18T12:19:17.416506096Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
- {"time":"2026-04-18T16:22:36.413732384Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
- {"time":"2026-04-18T16:22:36.528364306Z","level":"INFO","msg":"handler: operation stats","stats":{}}
10
- {"time":"2026-04-18T16:22:36.53202978Z","level":"INFO","msg":"stream: closing","id":"2mk39j3k"}
11
- {"time":"2026-04-18T16:22:36.532057065Z","level":"INFO","msg":"handler: closed","stream_id":"2mk39j3k"}
12
- {"time":"2026-04-18T16:22:36.532311468Z","level":"INFO","msg":"sender: closed","stream_id":"2mk39j3k"}
13
- {"time":"2026-04-18T16:22:36.532328723Z","level":"INFO","msg":"stream: closed","id":"2mk39j3k"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20260418_121916-2mk39j3k/logs/debug.log DELETED
@@ -1,24 +0,0 @@
1
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_setup.py:_flush():81] Configure stats pid to 4105
3
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/2026-04-18/12-19-14/wandb/run-20260418_121916-2mk39j3k/logs/debug.log
5
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/2026-04-18/12-19-14/wandb/run-20260418_121916-2mk39j3k/logs/debug-internal.log
6
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:init():844] calling init triggers
7
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
- config: {'model': {'name': 'EleutherAI/pythia-1.4b', 'checkpoint_path': None, 'from_scratch': False}, 'training': {'epochs': 3, 'batch_size': 4, 'eval_batch_size': 12, 'gradient_accumulation_steps': 4, 'lr': 2e-05, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 4, 'pin_memory': True}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 1000, 'save_every_epoch': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'code-completion_full', 'run_name': 'pythia_1_4b_v4_lr_2e-5', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/2026-04-18/12-19-14'}, 'paths': {'output_dir': 'outputs/2026-04-18/12-19-14'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_pythia/train.py'}}
9
- 2026-04-18 12:19:16,551 INFO MainThread:4105 [wandb_init.py:init():892] starting backend
10
- 2026-04-18 12:19:16,824 INFO MainThread:4105 [wandb_init.py:init():895] sending inform_init request
11
- 2026-04-18 12:19:16,852 INFO MainThread:4105 [wandb_init.py:init():903] backend started and connected
12
- 2026-04-18 12:19:16,858 INFO MainThread:4105 [wandb_init.py:init():973] updated telemetry
13
- 2026-04-18 12:19:16,890 INFO MainThread:4105 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
- 2026-04-18 12:19:17,414 INFO MainThread:4105 [wandb_init.py:init():1044] starting run threads in backend
15
- 2026-04-18 12:19:17,567 INFO MainThread:4105 [wandb_run.py:_console_start():2529] atexit reg
16
- 2026-04-18 12:19:17,568 INFO MainThread:4105 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
- 2026-04-18 12:19:17,568 INFO MainThread:4105 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
- 2026-04-18 12:19:17,568 INFO MainThread:4105 [wandb_run.py:_redirect():2469] Redirects installed.
19
- 2026-04-18 12:19:17,571 INFO MainThread:4105 [wandb_init.py:init():1084] run started, returning control to user process
20
- 2026-04-18 16:22:34,834 INFO MainThread:4105 [wandb_run.py:_finish():2295] finishing run nikita/code-completion_full/2mk39j3k
21
- 2026-04-18 16:22:34,835 INFO MainThread:4105 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
- 2026-04-18 16:22:34,835 INFO MainThread:4105 [wandb_run.py:_restore():2476] restore
23
- 2026-04-18 16:22:34,835 INFO MainThread:4105 [wandb_run.py:_restore():2482] restore done
24
- 2026-04-18 16:22:36,531 INFO MainThread:4105 [wandb_run.py:_footer_sync_info():3870] logging synced files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/run-20260418_121916-2mk39j3k/run-2mk39j3k.wandb DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9aa82c70ac11e1e1ec7ed8e2f4354f65bceddca985f5a275dd53cffb1973aae
3
- size 4312629