'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
|
|
| import torch.nn.init as init |
| import torch |
| from torch import nn |
| import torch.distributed as dist |
|
|
|
|
|
|
|
|
| from .gate import TopKGate |
| import copy |
| import typing |
|
|
| from .experts import FusedExperts as Experts |
|
|
|
|
class TaskMoE(torch.nn.Module):
    """Mixture-of-Experts layer with task-aware top-k gating.

    Each input token is routed by a ``TopKGate`` and dispatched to a fused
    bank of experts (``FusedExperts``). A previously computed gate decision
    may be passed to ``forward`` so that several layers can reuse one
    routing result.

    Arguments:
        hidden_size (int): hidden dimension of the model; this is both the
            input and the output dimension of the layer.
        expert (torch.nn.Module): prototype expert module; must be an
            ``nn.Linear`` or an ``nn.MultiheadAttention`` instance.
        num_experts (int, optional): default=1, total number of experts per
            layer.
        k (int, optional): default=1, top-k gating value.
        noisy_gate_policy (str, optional): default=None, noisy gate policy
            forwarded to the gate (e.g. 'Jitter', 'RSample' or None).
        cfg: configuration object handed to both the gate and the experts.

    NOTE(review): ``capacity_factor``, ``eval_capacity_factor``,
    ``min_capacity``, ``drop_tokens``, ``use_rts`` and ``use_tutel`` are
    accepted for interface compatibility but are never read by this
    implementation — confirm whether they should be forwarded to the gate.
    """
    def __init__(self,
                 hidden_size,
                 expert,
                 num_experts=1,
                 k=1,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 noisy_gate_policy: typing.Optional[str] = None,
                 drop_tokens: bool = True,
                 use_rts=True,
                 use_tutel: bool = False,
                 cfg=None):
        super().__init__()

        self.num_experts = num_experts

        # Tag the expert flavour from the prototype's class; anything other
        # than the two supported prototypes is rejected up front.
        for proto_cls, tag in ((nn.Linear, 'linear'),
                               (nn.MultiheadAttention, 'attention')):
            if isinstance(expert, proto_cls):
                self.expert_type = tag
                break
        else:
            raise NotImplementedError('please check expert type')

        # Build the expert bank before the gate (preserves construction /
        # parameter-initialization order), but register the gate submodule
        # first so the module attribute order is: gate, experts.
        expert_bank = Experts(expert, cfg, num_experts)

        self.gate = TopKGate(hidden_size,
                             num_experts,
                             k,
                             noisy_gate_policy,
                             cfg,
                             moe_type=self.expert_type)

        self.experts = expert_bank

    def forward(self, hidden_states, gate_decision=None, **kwargs):
        """Route tokens to experts and return their combined output.

        Arguments:
            hidden_states (Tensor): input to the layer.
            gate_decision (optional): precomputed ``[top_indices, gates]``
                pair; when supplied, the gate is skipped and this routing
                is reused as-is.
        Returns:
            tuple: ``(output, [top_indices, gates])`` — the expert output
            and the routing decision that was actually used.
        """
        if gate_decision is None:
            top_indices, gates = self.gate(hidden_states, **kwargs)
        else:
            top_indices, gates = gate_decision

        expert_output = self.experts(hidden_states, top_indices, gates, **kwargs)

        return expert_output, [top_indices, gates]
|
|