from __future__ import print_function
import json
import os
import struct
import sys
import platform
import re
import time
import traceback
import requests
import socket
import random
import math
import numpy as np
import torch
import logging
import datetime
from torch.optim.lr_scheduler import _LRScheduler
from torch import nn
import torch.nn.functional as F
from torch.nn.modules.loss import _WeightedLoss

def seed_all(seed_value, cuda_deterministic=False):
    """
    Seed every source of randomness (Python, NumPy, PyTorch CPU and CUDA).
    """
    random.seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # cuDNN: deterministic mode is reproducible but slower; benchmark mode
    # is faster but may vary between runs.
    if cuda_deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    else:
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True

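# Illustrative usage (a sketch; the seed value is arbitrary): call once at
# process start, before datasets and models are built.
#
#     seed_all(42, cuda_deterministic=True)
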
def set_log(logfileName, rank=-1):
    """
    The master process (rank -1 or 0) logs everything; other ranks log
    only warnings and errors.
    """
    log_file_folder = os.path.dirname(logfileName)
    time_now = datetime.datetime.now()
    logfileName = f'{logfileName}_{time_now.year}_{time_now.month}_{time_now.day}_{time_now.hour}_{time_now.minute}.log'
    # guard against an empty dirname when logfileName has no folder component
    if log_file_folder and not os.path.exists(log_file_folder):
        os.makedirs(log_file_folder)

    logging.basicConfig(
        level=logging.INFO if rank in [-1, 0] else logging.WARN,
        format='[%(asctime)s %(levelname)s %(filename)s line %(lineno)d %(process)d] %(message)s',
        datefmt='[%X]',
        handlers=[logging.FileHandler(logfileName), logging.StreamHandler()]
    )
    logger = logging.getLogger()
    return logger

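# Illustrative usage (a sketch; 'logs/train' is an assumed prefix): the
# timestamp and '.log' suffix are appended automatically.
#
#     logger = set_log('logs/train', rank=local_rank)
#     logger.info('training started')
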
def save_ckpt(epoch, model, optimizer, scheduler, losses, model_name, ckpt_folder):
    """
    Save a full training checkpoint (epoch, model, optimizer, scheduler, losses).
    Note: `model.module` assumes the model is wrapped in DataParallel or
    DistributedDataParallel, and `ckpt_folder` must end with a path separator.
    """
    if not os.path.exists(ckpt_folder):
        os.makedirs(ckpt_folder)
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.module.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'losses': losses,
        },
        f'{ckpt_folder}{model_name}_{epoch}.pth'
    )

def save_simple_ckpt(model, model_name, ckpt_folder):
    """
    Save only the model weights (same wrapper assumptions as `save_ckpt`).
    """
    if not os.path.exists(ckpt_folder):
        os.makedirs(ckpt_folder)
    torch.save(
        {
            'model_state_dict': model.module.state_dict()
        },
        f'{ckpt_folder}{model_name}.pth'
    )

def save_best_ckpt(epoch, model, optimizer, scheduler, losses, model_name, ckpt_folder):
    """
    Save the best checkpoint so far under a fixed `_best` filename
    (same wrapper assumptions as `save_ckpt`).
    """
    if not os.path.exists(ckpt_folder):
        os.makedirs(ckpt_folder)
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.module.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'losses': losses,
        },
        f'{ckpt_folder}{model_name}_best.pth'
    )

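# Restoring a checkpoint written by save_ckpt (a sketch; the path and
# map_location are assumptions, and `model` must be wrapped in DDP or
# DataParallel so that `model.module` matches the saved state dict):
#
#     ckpt = torch.load('ckpts/mymodel_10.pth', map_location='cpu')
#     model.module.load_state_dict(ckpt['model_state_dict'])
#     optimizer.load_state_dict(ckpt['optimizer_state_dict'])
#     scheduler.load_state_dict(ckpt['scheduler_state_dict'])
#     start_epoch = ckpt['epoch'] + 1
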
def get_reduced(tensor, current_device, dest_device, world_size):
    """
    Reduce a scalar (or 0-d tensor) from all GPUs onto the destination rank
    and return the mean. Note: `torch.distributed.reduce` only guarantees a
    correct result on the destination rank.
    """
    tensor = tensor.clone().detach() if torch.is_tensor(tensor) else torch.tensor(tensor)
    tensor = tensor.to(current_device)
    torch.distributed.reduce(tensor, dst=dest_device)
    tensor_mean = tensor.item() / world_size
    return tensor_mean

def get_ndtensor_reduced(tensor, current_device, dest_device, world_size):
    """
    Reduce a 1-D or 2-D tensor from all GPUs onto the destination rank and
    return the element-wise mean; other shapes return zeros. As with
    `get_reduced`, the result is only meaningful on the destination rank.
    """
    tensor = tensor.clone().detach() if torch.is_tensor(tensor) else torch.tensor(tensor)
    tensor = tensor.to(current_device)
    torch.distributed.reduce(tensor, dst=dest_device)
    tensor_mean = torch.zeros(tensor.shape)
    if len(tensor.shape) in (1, 2):
        # element-wise mean across ranks (vectorized form of the original
        # per-element loop)
        tensor_mean = tensor.cpu() / world_size
    return tensor_mean

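# Typical use inside a DDP training loop (a sketch; `loss`, `local_rank`,
# `world_size`, and `logger` are assumed to come from the surrounding script):
#
#     loss_mean = get_reduced(loss, local_rank, 0, world_size)
#     if local_rank == 0:
#         logger.info(f'mean loss across ranks: {loss_mean:.4f}')
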
def numel(m: torch.nn.Module, only_trainable: bool = False):
    """
    Returns the total number of parameters used by `m` (only counting
    shared parameters once); if `only_trainable` is True, then only
    includes parameters with `requires_grad = True`.
    """
    parameters = m.parameters()
    if only_trainable:
        parameters = [p for p in parameters if p.requires_grad]
    # deduplicate by storage pointer so shared/tied weights count once
    unique = {p.data_ptr(): p for p in parameters}.values()
    return sum(p.numel() for p in unique)

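# Illustrative usage (a sketch):
#
#     total = numel(model)
#     trainable = numel(model, only_trainable=True)
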
def label_smooth(y, K, epsilon=0.1):
    """
    Label smoothing for multiclass labels.
    One-hot encodes labels `y` over `K` classes; `y` is expected to be
    1-indexed (e.g. [1, 6, 3, ...]), hence the `y[index] - 1` below.
    """
    m = len(y)
    out = np.ones((m, K)) * epsilon / K
    for index in range(m):
        out[index][y[index] - 1] += 1 - epsilon
    return torch.tensor(out)

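# Worked example (a sketch): with K=3 and epsilon=0.1, every entry starts at
# 0.1/3 ≈ 0.0333 and the true class gets an extra 0.9:
#
#     label_smooth([1, 3], 3)
#     # tensor([[0.9333, 0.0333, 0.0333],
#     #         [0.0333, 0.0333, 0.9333]], dtype=torch.float64)
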
class SequentialDistributedSampler(torch.utils.data.sampler.Sampler):
    """
    Distributed Sampler that subsamples indices sequentially,
    making it easier to collate all results at the end.
    Even though we only use this sampler for eval and predict (no training),
    which means that the model params won't have to be synced (i.e. will not hang
    for synchronization even if varied number of forward passes), we still add extra
    samples to the sampler to make it evenly divisible (like in `DistributedSampler`)
    to make it easy to `gather` or `reduce` resulting tensors at the end of the loop.
    """

    def __init__(self, dataset, batch_size, world_size, rank=None, num_replicas=None):
        if num_replicas is None:
            if not torch.distributed.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = world_size
        if rank is None:
            if not torch.distributed.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = torch.distributed.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.batch_size = batch_size
        # round the per-replica sample count up to a whole number of batches
        self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.batch_size / self.num_replicas)) * self.batch_size
        self.total_size = self.num_samples * self.num_replicas

    def __iter__(self):
        indices = list(range(len(self.dataset)))
        # pad by repeating the last index so total_size divides evenly
        indices += [indices[-1]] * (self.total_size - len(indices))
        # each rank takes a contiguous, sequential slice
        indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples]
        return iter(indices)

    def __len__(self):
        return self.num_samples

def distributed_concat(tensor, num_total_examples, world_size):
    """
    Gather per-process inference results and concatenate them in rank order.
    """
    output_tensors = [tensor.clone() for _ in range(world_size)]
    torch.distributed.all_gather(output_tensors, tensor)
    concat = torch.cat(output_tensors, dim=0)
    # truncate the padded duplicates added by SequentialDistributedSampler
    return concat[:num_total_examples]

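# How the sampler and distributed_concat fit together (a sketch; the dataset,
# model, and device variables are assumptions, and each batch is assumed to
# be a single tensor):
#
#     sampler = SequentialDistributedSampler(eval_set, batch_size, world_size)
#     loader = torch.utils.data.DataLoader(eval_set, batch_size=batch_size, sampler=sampler)
#     preds = []
#     for batch in loader:
#         preds.append(model(batch.to(device)))
#     preds = distributed_concat(torch.cat(preds, dim=0), len(eval_set), world_size)
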
class CosineAnnealingWarmupRestarts(_LRScheduler):
    """
    Cosine-annealing scheduler with linear warmup and warm restarts.

    optimizer (Optimizer): Wrapped optimizer.
    first_cycle_steps (int): First cycle step size.
    cycle_mult (float): Cycle steps magnification. Default: 1.
    max_lr (float): First cycle's max learning rate. Default: 0.1.
    min_lr (float): Min learning rate. Default: 0.001.
    warmup_steps (int): Linear warmup step size. Default: 0.
    gamma (float): Decrease rate of max learning rate by cycle. Default: 1.
    last_epoch (int): The index of last epoch. Default: -1.
    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 first_cycle_steps: int,
                 cycle_mult: float = 1.,
                 max_lr: float = 0.1,
                 min_lr: float = 0.001,
                 warmup_steps: int = 0,
                 gamma: float = 1.,
                 last_epoch: int = -1
                 ):
        assert warmup_steps < first_cycle_steps

        self.first_cycle_steps = first_cycle_steps
        self.cycle_mult = cycle_mult
        self.base_max_lr = max_lr
        self.max_lr = max_lr
        self.min_lr = min_lr
        self.warmup_steps = warmup_steps
        self.gamma = gamma

        self.cur_cycle_steps = first_cycle_steps
        self.cycle = 0
        self.step_in_cycle = last_epoch

        super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch)

        self.init_lr()

    def init_lr(self):
        self.base_lrs = []
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.min_lr
            self.base_lrs.append(self.min_lr)

    def get_lr(self):
        if self.step_in_cycle == -1:
            return self.base_lrs
        elif self.step_in_cycle < self.warmup_steps:
            # linear warmup from base_lr up to max_lr
            return [(self.max_lr - base_lr) * self.step_in_cycle / self.warmup_steps + base_lr
                    for base_lr in self.base_lrs]
        else:
            # cosine annealing from max_lr back down to base_lr
            return [base_lr + (self.max_lr - base_lr)
                    * (1 + math.cos(math.pi * (self.step_in_cycle - self.warmup_steps)
                                    / (self.cur_cycle_steps - self.warmup_steps))) / 2
                    for base_lr in self.base_lrs]

    def step(self, epoch=None):
        if epoch is None:
            epoch = self.last_epoch + 1
            self.step_in_cycle = self.step_in_cycle + 1
            if self.step_in_cycle >= self.cur_cycle_steps:
                # cycle finished: start the next (possibly longer) cycle
                self.cycle += 1
                self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps
                self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps
        else:
            if epoch >= self.first_cycle_steps:
                if self.cycle_mult == 1.:
                    self.step_in_cycle = epoch % self.first_cycle_steps
                    self.cycle = epoch // self.first_cycle_steps
                else:
                    # recover the cycle index n from the geometric series of cycle lengths
                    n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult))
                    self.cycle = n
                    self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1))
                    self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** n
            else:
                self.cur_cycle_steps = self.first_cycle_steps
                self.step_in_cycle = epoch

        # decay the per-cycle max learning rate, then apply the new lr
        self.max_lr = self.base_max_lr * (self.gamma ** self.cycle)
        self.last_epoch = math.floor(epoch)
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

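# Typical wiring (a sketch; the optimizer and step counts are assumptions).
# Calling step() with no argument advances the schedule by one step:
#
#     optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
#     scheduler = CosineAnnealingWarmupRestarts(
#         optimizer, first_cycle_steps=1000, cycle_mult=2.0,
#         max_lr=1e-4, min_lr=1e-6, warmup_steps=100, gamma=0.9)
#     for step in range(total_steps):
#         train_one_step(...)
#         scheduler.step()
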
class DistanceLoss(_WeightedLoss):
    """
    Cross-entropy loss weighted by the class-index distance between the
    predicted and target classes.
    """
    def __init__(self, weight=None, reduction='mean', ignore_index=None):
        super().__init__(weight=weight, reduction=reduction)
        self.weight = weight
        self.reduction = reduction
        self.ignore_index = ignore_index

    def forward(self, inputs, targets):
        # flatten (batch, seq, classes) logits and (batch, seq) targets
        if len(inputs.shape) > 2:
            inputs = inputs.reshape(-1, inputs.size(-1))
        if len(targets.shape) > 1:
            targets = targets.reshape(-1)
        if self.ignore_index is not None:
            keep_index = (targets != self.ignore_index).nonzero(as_tuple=True)[0]
            targets = torch.index_select(targets, 0, keep_index)
            inputs = torch.index_select(inputs, 0, keep_index)
        lsm = F.log_softmax(inputs, -1)
        # one-hot encode the targets
        targets = torch.empty(size=(targets.size(0), inputs.size(-1)), device=targets.device).fill_(0).scatter_(1, targets.data.unsqueeze(1), 1)
        if self.weight is not None:
            lsm = lsm * self.weight.unsqueeze(0)
        loss = -(targets * lsm).sum(-1)
        # predicted and true class indices; the first and last classes are
        # excluded from the argmax (presumably special/padding classes).
        # argmax is non-differentiable, so the distance acts as a constant
        # per-example weight on the cross-entropy term.
        inputs = nn.Softmax(dim=-1)(inputs)[..., 1:-1].argmax(dim=-1) + 1
        targets = nn.Softmax(dim=-1)(targets)[..., 1:-1].argmax(dim=-1) + 1
        distance = abs(inputs - targets) + 1e-2

        loss = loss * distance
        if self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

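# Illustrative usage (a sketch; shapes are assumptions). The distance weight
# makes near-miss predictions cheaper than far-off ones, which fits ordinal
# class sets:
#
#     criterion = DistanceLoss(ignore_index=0)
#     loss = criterion(logits, labels)  # logits: (N, C), labels: (N,)
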
class LabelSmoothCrossEntropyLoss(_WeightedLoss):
    """
    CrossEntropyLoss with label smoothing.
    """
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth_one_hot(targets: torch.Tensor, n_classes: int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            # put (1 - smoothing) on the true class and spread the rest evenly
            targets = torch.empty(size=(targets.size(0), n_classes),
                                  device=targets.device) \
                .fill_(smoothing / (n_classes - 1)) \
                .scatter_(1, targets.data.unsqueeze(1), 1. - smoothing)
        return targets

    def forward(self, inputs, targets):
        targets = LabelSmoothCrossEntropyLoss._smooth_one_hot(targets, inputs.size(-1),
                                                              self.smoothing)
        lsm = F.log_softmax(inputs, -1)

        if self.weight is not None:
            lsm = lsm * self.weight.unsqueeze(0)

        loss = -(targets * lsm).sum(-1)

        if self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()

        return loss
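
# Illustrative usage (a sketch; shapes are assumptions):
#
#     criterion = LabelSmoothCrossEntropyLoss(smoothing=0.1)
#     logits = torch.randn(8, 10)          # (batch, classes)
#     labels = torch.randint(0, 10, (8,))  # 0-indexed class ids
#     loss = criterion(logits, labels)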