| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
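"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss, while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.

NOTE(review): this description appears to be stale. The code in this file builds a SQL2TextModel and trains/evaluates
it on a SQL2QueryDataset with a TextGenerationScorer; confirm and update the summary accordingly.
"""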
| |
|
| |
|
| | import logging |
| | import math |
| | import os |
| | from dataclasses import dataclass, field |
| | from typing import Optional |
| | import torch |
| |
|
| | from transformers import ( |
| | MODEL_WITH_LM_HEAD_MAPPING, |
| | AutoTokenizer, |
| | HfArgumentParser, |
| | PreTrainedTokenizer, |
| | set_seed, |
| | ) |
| | from relogic.pretrainkit.models.sql_to_text import SQL2TextModel |
| | from relogic.pretrainkit.trainer import Trainer |
| | from pretrainkit.datasets.text_generation.sql_to_text import SQL2QueryDataset, DataCollatorForSQL2Query |
| | from relogic.pretrainkit.scorers.text_generation import TextGenerationScorer |
| | from relogic.pretrainkit.training_args import TrainingArguments |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
| | MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys()) |
| | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) |
| |
|
| |
|
| | @dataclass |
| | class ModelArguments: |
| | """ |
| | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. |
| | """ |
| |
|
| | model_name_or_path: Optional[str] = field( |
| | default=None, |
| | metadata={ |
| | "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch." |
| | }, |
| | ) |
| | model_type: Optional[str] = field( |
| | default=None, |
| | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, |
| | ) |
| | config_name: Optional[str] = field( |
| | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} |
| | ) |
| | tokenizer_name: Optional[str] = field( |
| | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} |
| | ) |
| | cache_dir: Optional[str] = field( |
| | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} |
| | ) |
| |
|
| |
|
| | @dataclass |
| | class DataTrainingArguments: |
| | """ |
| | Arguments pertaining to what data we are going to input our model for training and eval. |
| | """ |
| |
|
| | train_data_file: Optional[str] = field( |
| | default=None, metadata={"help": "The input training data file (a text file)."} |
| | ) |
| | eval_data_file: Optional[str] = field( |
| | default=None, |
| | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, |
| | ) |
| | line_by_line: bool = field( |
| | default=False, |
| | metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."}, |
| | ) |
| |
|
| | mlm: bool = field( |
| | default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."} |
| | ) |
| | mlm_probability: float = field( |
| | default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"} |
| | ) |
| |
|
| | block_size: int = field( |
| | default=-1, |
| | metadata={ |
| | "help": "Optional input sequence length after tokenization." |
| | "The training dataset will be truncated in block of this size for training." |
| | "Default to the model max input length for single sentence inputs (take into account special tokens)." |
| | }, |
| | ) |
| | overwrite_cache: bool = field( |
| | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} |
| | ) |
| | |
| | |
| | |
| |
|
| |
|
| | def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False): |
| | file_path = args.eval_data_file if evaluate else args.train_data_file |
| | return SQL2QueryDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size) |
| |
|
| |
|
| | def main(): |
| | |
| | |
| | |
| |
|
| | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) |
| | model_args, data_args, training_args = parser.parse_args_into_dataclasses() |
| |
|
| | if data_args.eval_data_file is None and training_args.do_eval: |
| | raise ValueError( |
| | "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " |
| | "or remove the --do_eval argument." |
| | ) |
| |
|
| | if ( |
| | os.path.exists(training_args.output_dir) |
| | and os.listdir(training_args.output_dir) |
| | and training_args.do_train |
| | and not training_args.overwrite_output_dir |
| | ): |
| | raise ValueError( |
| | f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." |
| | ) |
| |
|
| | |
| | logging.basicConfig( |
| | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", |
| | datefmt="%m/%d/%Y %H:%M:%S", |
| | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, |
| | ) |
| | logger.warning( |
| | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", |
| | training_args.local_rank, |
| | training_args.device, |
| | training_args.n_gpu, |
| | bool(training_args.local_rank != -1), |
| | training_args.fp16, |
| | ) |
| | logger.info("Training/evaluation parameters %s", training_args) |
| |
|
| | |
| | set_seed(training_args.seed) |
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | """Initialize models and tokenizer""" |
| | if model_args.tokenizer_name: |
| | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir) |
| | elif model_args.model_name_or_path: |
| | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) |
| | else: |
| | raise ValueError( |
| | "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it," |
| | "and load it from here, using --tokenizer_name" |
| | ) |
| |
|
| | model = SQL2TextModel() |
| | if training_args.do_eval and not training_args.do_train: |
| | model_param = torch.load(os.path.join(model_args.model_name_or_path, "pytorch_model.bin")) |
| | model.load_state_dict(model_param) |
| | print("All key matched and load successfully.") |
| |
|
| |
|
| | if data_args.block_size <= 0: |
| | data_args.block_size = tokenizer.max_len |
| | |
| | else: |
| | data_args.block_size = min(data_args.block_size, tokenizer.max_len) |
| |
|
| | |
| |
|
| | train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None |
| | eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None |
| | |
| | |
| | |
| | data_collator = DataCollatorForSQL2Query(tokenizer=tokenizer) |
| |
|
| | label_bos_id = data_collator.label_bos_id |
| | label_eos_id = data_collator.label_eos_id |
| |
|
| | match_sequence_scorer = TextGenerationScorer(bos_id=label_bos_id, eos_id=label_eos_id, tokenizer=tokenizer, output_path=os.path.join(training_args.output_dir, "eval_dump.json")) |
| | |
| | trainer = Trainer( |
| | model=model, |
| | args=training_args, |
| | data_collator=data_collator, |
| | train_dataset=train_dataset, |
| | eval_dataset=eval_dataset, |
| | compute_metrics=match_sequence_scorer |
| | ) |
| |
|
| | |
| | if training_args.do_train: |
| | model_path = ( |
| | model_args.model_name_or_path |
| | if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) |
| | else None |
| | ) |
| | trainer.train(model_path=model_path) |
| | trainer.save_model() |
| | |
| | |
| | if trainer.is_world_master(): |
| | tokenizer.save_pretrained(training_args.output_dir) |
| |
|
| | |
| | results = {} |
| | if training_args.do_eval: |
| | logger.info("*** Evaluate ***") |
| |
|
| | eval_output = trainer.evaluate() |
| |
|
| | perplexity = math.exp(eval_output["eval_loss"]) |
| | result = {"perplexity": perplexity} |
| |
|
| | output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") |
| | if trainer.is_world_master(): |
| | with open(output_eval_file, "w") as writer: |
| | logger.info("***** Eval results *****") |
| | for key in sorted(result.keys()): |
| | logger.info(" %s = %s", key, str(result[key])) |
| | writer.write("%s = %s\n" % (key, str(result[key]))) |
| |
|
| | results.update(result) |
| |
|
| | return results |
| |
|
| |
|
| | def _mp_fn(index): |
| | |
| | main() |
| |
|
| |
|
| | if __name__ == "__main__": |
| | main() |
| |
|