| | import dataclasses |
| | import json |
| | import logging |
| | import os |
| | from dataclasses import dataclass, field |
| | from typing import Any, Dict, Optional, Tuple |
| |
|
| | from transformers.file_utils import cached_property, is_torch_available, is_torch_tpu_available, torch_required |
| |
|
| |
|
# torch and torch_xla are optional dependencies: import them only when the
# corresponding backend is actually installed, so this module stays importable
# in a torch-free environment.
if is_torch_available():
    import torch


if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm


# Module-level logger, following the project-wide `getLogger(__name__)` convention.
logger = logging.getLogger(__name__)
| |
|
| |
|
def default_logdir() -> str:
    """Return the default TensorBoard log directory (same scheme as PyTorch).

    Returns:
        A relative path of the form ``runs/<MonDD_HH-MM-SS>_<hostname>``.
    """
    # Imported lazily: only needed when no explicit logging_dir is supplied.
    import socket
    from datetime import datetime

    stamp = datetime.now().strftime("%b%d_%H-%M-%S")
    return os.path.join("runs", f"{stamp}_{socket.gethostname()}")
| |
|
| |
|
@dataclass
class TrainingArguments:
    """
    TrainingArguments is the subset of the arguments we use in our example scripts
    **which relate to the training loop itself**.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line: each field below becomes a CLI flag, with the
    ``metadata["help"]`` string used as its help text.
    """

    # --- Output / run control -------------------------------------------------
    output_dir: str = field(
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."}
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory."
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )

    # --- Which phases to run --------------------------------------------------
    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    evaluate_during_training: bool = field(
        default=False, metadata={"help": "Run evaluation during training at each logging step."},
    )

    # --- Batch sizes ----------------------------------------------------------
    # `per_device_*` are the preferred knobs; the `per_gpu_*` pair below is kept
    # only for backward compatibility (see the batch-size properties further down).
    per_device_train_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
    )
    per_device_eval_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
    )

    per_gpu_train_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_train_batch_size` is preferred. "
            "Batch size per GPU/TPU core/CPU for training."
        },
    )
    per_gpu_eval_batch_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Deprecated, the use of `--per_device_eval_batch_size` is preferred."
            "Batch size per GPU/TPU core/CPU for evaluation."
        },
    )

    gradient_accumulation_steps: int = field(
        default=1,
        metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."},
    )

    # --- Optimizer hyperparameters -------------------------------------------
    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for Adam."})
    weight_decay: float = field(default=0.0, metadata={"help": "Weight decay if we apply some."})
    adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for Adam optimizer."})
    max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})

    # --- Schedule length ------------------------------------------------------
    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
    max_steps: int = field(
        default=-1,
        metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."},
    )
    warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})

    # --- Logging / evaluation cadence ----------------------------------------
    logging_tqdm: bool = field(default=False, metadata={"help": "Show tqdm or not."})
    eval_steps: int = field(default=500, metadata={"help": "Run validation every X updates steps."})

    # `default_factory` makes the timestamped default directory be computed per
    # instance at construction time, not once at import time.
    logging_dir: Optional[str] = field(default_factory=default_logdir, metadata={"help": "Tensorboard log dir."})
    logging_first_step: bool = field(default=False, metadata={"help": "Log and eval the first global_step"})
    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
    save_total_limit: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Limit the total amount of checkpoints."
                "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints"
            )
        },
    )

    # --- Hardware / reproducibility ------------------------------------------
    no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"})
    seed: int = field(default=42, metadata={"help": "random seed for initialization"})

    fp16: bool = field(
        default=False,
        metadata={"help": "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"},
    )
    fp16_opt_level: str = field(
        default="O1",
        metadata={
            "help": (
                "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                "See details at https://nvidia.github.io/apex/amp.html"
            )
        },
    )
    # -1 means "not launched with torch.distributed" (see _setup_devices).
    local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"})

    tpu_num_cores: Optional[int] = field(
        default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"}
    )
    tpu_metrics_debug: bool = field(default=False, metadata={"help": "TPU: Whether to print debug metrics"})

    dataloader_drop_last: bool = field(
        default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."}
    )

    @property
    def train_batch_size(self) -> int:
        """Effective per-node training batch size: per-device size * max(1, n_gpu).

        Falls back to the deprecated ``per_gpu_train_batch_size`` when it is set
        (truthy), emitting a deprecation warning.
        """
        if self.per_gpu_train_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_train_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

    @property
    def eval_batch_size(self) -> int:
        """Effective per-node evaluation batch size: per-device size * max(1, n_gpu).

        Falls back to the deprecated ``per_gpu_eval_batch_size`` when it is set
        (truthy), emitting a deprecation warning.
        """
        if self.per_gpu_eval_batch_size:
            logger.warning(
                "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future "
                "version. Using `--per_device_eval_batch_size` is preferred."
            )
        per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size
        return per_device_batch_size * max(1, self.n_gpu)

    @cached_property
    @torch_required
    def _setup_devices(self) -> Tuple["torch.device", int]:
        """Choose the torch device and GPU count once; result is cached.

        Priority: forced CPU (`no_cuda`) > TPU > single-process (possibly
        multi-GPU via DataParallel) > torch.distributed (one GPU per process).
        """
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            device = torch.device("cpu")
            n_gpu = 0
        elif is_torch_tpu_available():
            # TPU cores are driven through torch_xla, so n_gpu stays 0.
            device = xm.xla_device()
            n_gpu = 0
        elif self.local_rank == -1:
            # Not launched via torch.distributed: use all visible GPUs from a
            # single process (the trainer presumably wraps the model in
            # DataParallel when n_gpu > 1 — confirm against the caller).
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            n_gpu = torch.cuda.device_count()
        else:
            # Distributed launch: initialize the NCCL process group, then bind
            # this process to its assigned GPU. Each process owns exactly one GPU.
            torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            n_gpu = 1

        if device.type == "cuda":
            # Make this device the default so tensors land on the right GPU.
            torch.cuda.set_device(device)

        return device, n_gpu

    @property
    @torch_required
    def device(self) -> "torch.device":
        """The torch device selected by :meth:`_setup_devices`."""
        return self._setup_devices[0]

    @property
    @torch_required
    def n_gpu(self) -> int:
        """Number of GPUs used by this process (0 on CPU/TPU, 1 when distributed)."""
        return self._setup_devices[1]

    def to_json_string(self):
        """
        Serializes this instance to a JSON string.
        """
        return json.dumps(dataclasses.asdict(self), indent=2)

    def to_sanitized_dict(self) -> Dict[str, Any]:
        """
        Sanitized serialization to use with TensorBoard’s hparams:
        values whose type TensorBoard cannot display are stringified.
        """
        d = dataclasses.asdict(self)
        valid_types = [bool, int, float, str]
        if is_torch_available():
            valid_types.append(torch.Tensor)
        return {k: v if type(v) in valid_types else str(v) for k, v in d.items()}
| |
|