|
|
|
|
import os

import torch
import wandb
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint
|
|
|
# Weights & Biases settings are passed through the environment; both keys are
# set unconditionally, overwriting any value already present.
os.environ.update(
    WANDB_PROJECT="codellama-7b-instruct-qlora-linux-bugfix",
    WANDB_NAME="run-v1",
)

# Base checkpoint to fine-tune, training data location, and output directory.
BASE_MODEL = "codellama/CodeLLaMA-7b-Instruct-hf"
DATA_PATH = "../dataset/linux_bugfix_100k.jsonl"
OUTPUT_DIR = "./output/qlora-codellama-bugfix"
|
|
|
|
|
# Read the JSON-lines file at DATA_PATH and take its "train" split.
dataset = load_dataset(
    "json",
    split="train",
    data_files=DATA_PATH,
)
|
|
|
|
|
# bitsandbytes quantization settings: 4-bit NF4 weights with double
# quantization enabled and bfloat16 as the compute dtype.
_bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**_bnb_options)
|
|
|
|
|
# Tokenizer: the EOS token doubles as the padding token, and padding is
# applied on the right.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Base model, loaded with the quantization settings in `bnb_config`;
# device placement is left to device_map="auto".
_model_load_options = {
    "quantization_config": bnb_config,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **_model_load_options)

# Run the PEFT k-bit preparation helper, then switch on gradient
# checkpointing explicitly.
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

# Permit TF32 for CUDA matmuls.
torch.backends.cuda.matmul.allow_tf32 = True
|
|
|
|
|
# LoRA adapter hyper-parameters. No target modules are listed here, so the
# selection is left to PEFT.
_lora_options = {
    "task_type": "CAUSAL_LM",
    "r": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    "bias": "none",
}
lora_config = LoraConfig(**_lora_options)

# Wrap the prepared base model with the adapters.
model = get_peft_model(model, lora_config)

# Config tweaks for training: no KV cache, dict-style outputs, and a pad id
# that matches the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.return_dict = True
model.config.use_cache = False

model.print_trainable_parameters()
|
|
|
|
|
# Upper bound on the combined sequence length, read once from the tokenizer.
model_max_len = tokenizer.model_max_length


# NOTE: this name shadows the builtin ``format``; it is kept because the rest
# of the script calls it by this name.
def format(example):
    """Tokenize one prompt/completion record for causal-LM fine-tuning.

    The prompt and completion are tokenized separately so that the prompt
    positions can be excluded from the loss with the ``-100`` label value.

    Args:
        example: Mapping with string fields ``"prompt"`` and ``"completion"``.

    Returns:
        A dict with two equal-length lists:
        ``"input_ids"`` - prompt ids followed by completion ids, and
        ``"labels"`` - ``-100`` over the prompt, token ids over the completion.
    """
    max_prompt_tokens = 1024
    max_completion_tokens = 512

    # The prompt keeps the tokenizer's default special tokens (for
    # LLaMA-family tokenizers that is a leading BOS).
    prompt_ids = tokenizer(
        example["prompt"],
        truncation=True,
        max_length=max_prompt_tokens,
    )["input_ids"]

    # The completion continues the same sequence, so it must NOT get its own
    # special tokens; otherwise a second BOS ends up in the middle of the
    # concatenated sequence.
    completion_ids = list(
        tokenizer(
            example["completion"],
            truncation=True,
            max_length=max_completion_tokens,
            add_special_tokens=False,
        )["input_ids"]
    )

    # Terminate the completion with EOS so the model is trained to stop.
    # Skip it when the completion filled the whole budget: it may have been
    # truncated, and EOS there would teach stopping mid-answer.
    eos_id = tokenizer.eos_token_id
    if eos_id is not None and len(completion_ids) < max_completion_tokens:
        completion_ids.append(eos_id)

    input_ids = prompt_ids + completion_ids
    labels = [-100] * len(prompt_ids) + completion_ids

    # Final safety clamp to the tokenizer's advertised maximum length.
    max_len = min(len(input_ids), model_max_len)

    return {
        "input_ids": input_ids[:max_len],
        "labels": labels[:max_len],
    }
|
|
|
|
|
|
|
# Smoke test: push a single formatted example through the model in training
# mode and verify that the resulting loss is attached to the autograd graph,
# before spending time tokenizing the whole dataset.
print("__ Sanity checking one example...")
sample = format(dataset[0])
# Add a leading batch dimension of 1 and move to the model's device.
test_input = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
test_labels = torch.tensor(sample["labels"]).unsqueeze(0).to(model.device)
model.train()
out = model(input_ids=test_input, labels=test_labels)
# Raise explicitly instead of using `assert`, so the check still runs under
# `python -O`; the exception type and message are unchanged.
if not out.loss.requires_grad:
    raise AssertionError("Sanity check failed: Loss does not require grad.")
# Drop the forward output so its autograd graph is not held for the rest of
# the script.
del out
print("__ Sanity check passed. Proceeding to map()...")
|
|
|
|
|
# Tokenize every record with `format`, dropping the raw text columns.
dataset = dataset.map(
    format,
    remove_columns=["prompt", "completion"],
)

# Batch collator: returns PyTorch tensors and pads to a multiple of 8.
collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    return_tensors="pt",
)
|
|
|
|
|
# Trainer hyper-parameters, grouped by concern.
# NOTE(review): run_name below differs from the WANDB_NAME environment value
# set near the top of the script - confirm which one is intended.
training_args = TrainingArguments(
    # --- output and checkpointing ---
    output_dir=OUTPUT_DIR,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    push_to_hub=False,
    # --- logging ---
    report_to="wandb",
    run_name="codellama-7b-instruct-qlora-linux-bugfix",
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    # --- schedule and batch shape ---
    num_train_epochs=3,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=4,
    # --- optimisation ---
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    max_grad_norm=1.0,
    # --- precision and memory ---
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,
    # --- dataset column handling ---
    label_names=["labels"],
    remove_unused_columns=False,
)
|
|
|
|
|
# Assemble the Trainer from the model, arguments, tokenized dataset,
# tokenizer and collator built above.
_trainer_parts = {
    "model": model,
    "args": training_args,
    "train_dataset": dataset,
    "tokenizer": tokenizer,
    "data_collator": collator,
}
trainer = Trainer(**_trainer_parts)
|
|
|
|
|
|
|
model.train()
# NOTE(review): the link below is assembled from the project and run-name
# environment variables only; confirm it matches the URL W&B actually
# assigns to the run.
print(f"Track this run in Weights & Biases: https://wandb.ai/{os.environ['WANDB_PROJECT']}/{os.environ['WANDB_NAME']}")

# Resume only when a checkpoint really exists. Passing
# resume_from_checkpoint=True unconditionally makes Trainer.train() raise
# ValueError on the very first run, when the output directory holds no
# checkpoint yet. get_last_checkpoint lists the directory, so guard against
# it not existing.
_checkpoint_dir = trainer.args.output_dir
last_checkpoint = (
    get_last_checkpoint(_checkpoint_dir) if os.path.isdir(_checkpoint_dir) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)
|
|
|
|
|
|
|
# Write the trained model (safetensors serialization requested) and the
# tokenizer side by side into the output directory.
model.save_pretrained(
    OUTPUT_DIR,
    safe_serialization=True,
)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"[DONE] Model saved to {OUTPUT_DIR}")
|
|
|