"""Load the FalconData training and validation splits from CSV files with `load_dataset`:"""

from datasets import load_dataset

Falcon = load_dataset('csv', data_files={"train": 'FalconData_train2.csv', "validation": 'FalconData_validation2.csv'})

print('Dataset Loaded!')
| | """Then take a look at an example:""" |
| |
|
| | Falcon['train'][0] |
| |
|
| | Falcon['validation'][0] |
| |
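"""As a quick sanity check, print the `DatasetDict` to confirm the split sizes and
column names; the preprocessing below assumes the CSVs expose a `Text` column:"""

print(Falcon)
print(Falcon["train"].features)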
| | """The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:""" |
| |
|
| | from transformers import AutoTokenizer, GPT2TokenizerFast |
| |
|
| | tokenizer = AutoTokenizer.from_pretrained("distilgpt2") |
| |
|
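"""To see what the tokenizer produces, encode a short sample string (a minimal
illustration; the exact ids depend on the GPT-2 vocabulary):"""

sample = tokenizer("A falcon stoops at over 300 km/h.")
print(sample["input_ids"])
print(sample["attention_mask"])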
"""GPT-2-style tokenizers have no padding token, so reuse the end-of-sequence
token for padding:"""

tokenizer.pad_token = tokenizer.eos_token

"""Flatten any nested columns (a no-op for a flat CSV schema) and look at an
example again:"""

Falcon = Falcon.flatten()
Falcon["train"][0]
"""Define a preprocessing function to tokenize the `Text` column:"""

def preprocess_function(examples):
    # With batched=True, examples["Text"] is a list of strings, which the
    # tokenizer accepts directly.
    return tokenizer(examples["Text"])
"""Apply the preprocessing function over the entire dataset with `map`, using
multiple processes and dropping the raw columns:"""

tokenized_Falcon = Falcon.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=Falcon["train"].column_names,
)
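"""At this stage each row holds `input_ids` and an `attention_mask`, and lengths
still vary from row to row:"""

print(len(tokenized_Falcon["train"][0]["input_ids"]))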
"""Chunk the corpus into blocks of the tokenizer's maximum context length
(1024 tokens for DistilGPT2); use a smaller value if memory is tight:"""

block_size = tokenizer.model_max_length
def group_texts(examples):
    # Concatenate all texts in the batch.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder that does not fill a whole block; it could
    # instead be padded if dropping is undesirable.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # For causal LM the labels are the inputs; the model shifts them internally.
    result["labels"] = result["input_ids"].copy()
    return result
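"""A toy sketch of what `group_texts` does, using hypothetical ids and a chunk
size of 4 instead of the real `block_size`:"""

def _demo_group_texts(examples, size):
    # Same logic as group_texts, parameterised on the chunk size for illustration.
    concatenated = {k: sum(v, []) for k, v in examples.items()}
    total = (len(concatenated["input_ids"]) // size) * size
    return {k: [t[i:i + size] for i in range(0, total, size)] for k, t in concatenated.items()}

print(_demo_group_texts({"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8, 9]]}, 4))
# -> {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]]}; the trailing [9] is dropped.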
| | """Apply the `group_texts` function over the entire dataset:""" |
| |
|
| | lm_dataset = tokenized_Falcon.map(group_texts, batched=True, num_proc=4) |
| |
|
| |
|
| |
|
"""Batch examples with `DataCollatorForLanguageModeling`; `mlm=False` selects
causal language modeling rather than masked language modeling:"""

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
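"""A quick check of the collator (shapes shown assume the grouped dataset above,
where every example is exactly `block_size` tokens long):"""

sample_batch = data_collator([lm_dataset["train"][i] for i in range(2)])
print({k: v.shape for k, v in sample_batch.items()})
# Expect input_ids, attention_mask, and labels of shape (2, block_size).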
"""Load the base checkpoint with `AutoModelForCausalLM`, in bfloat16 to reduce
memory use:"""

from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
import torch

model = AutoModelForCausalLM.from_pretrained("rwh/tiny8", torch_dtype=torch.bfloat16)

print('Model Loaded!')
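"""Optionally report the parameter count (the figure depends on the checkpoint):"""

n_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {n_params / 1e6:.1f}M")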
"""Move the model to the GPU (the `Trainer` would also do this automatically)
and choose an output directory:"""

model.to('cuda')

OutputDir = "C1ReadyModel"
"""Define the training hyperparameters. `load_best_model_at_end=True` requires
the evaluation and save strategies to match, so both are set to "steps":"""

training_args = TrainingArguments(
    output_dir=OutputDir,
    overwrite_output_dir=True,
    bf16=True,
    evaluation_strategy="steps",
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    lr_scheduler_type="linear",
    push_to_hub=False,
    save_total_limit=2,
    save_strategy="steps",
    load_best_model_at_end=True,
    save_safetensors=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
    data_collator=data_collator,
)
print('Started Training!')
trainer.train()
trainer.save_model(OutputDir)
print('Saved Model Path:', OutputDir)
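"""To make the output directory self-contained, also save the tokenizer; the
checkpoint can then be reloaded with `from_pretrained` (a sketch using the
names defined above):"""

tokenizer.save_pretrained(OutputDir)
reloaded_model = AutoModelForCausalLM.from_pretrained(OutputDir)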
"""Evaluate and report perplexity, the exponential of the evaluation
cross-entropy loss:"""

import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
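"""Finally, a short generation smoke test (a hedged sketch; the prompt is
arbitrary and output quality depends entirely on the training data):"""

inputs = tokenizer("The falcon", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40, do_sample=True, top_p=0.95)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))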