import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
# Set to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True
Reducing the amount of computation required for sequence-processing tasks is an important problem in neural network design. Previously proposed networks, including the Extended Neural GPU, ByteNet, and ConvS2S, were all designed to address it. These networks are built on CNNs and compute the hidden representations for all input and output positions in parallel.
In these models, the number of operations required to relate two arbitrary input or output positions grows with the distance between those positions: linearly for ConvS2S and logarithmically for ByteNet. This growth makes it much harder to learn dependencies between distant positions. In the Transformer, the number of operations is reduced to a constant (although averaging over attention-weighted positions reduces the effective resolution, Multi-Head Attention counteracts this effect).
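To make the constant-cost claim concrete, here is a minimal sketch (not part of the original code; tensor names and sizes are illustrative assumptions): a single scaled dot product produces a score between every pair of positions at once, so relating distant positions costs no more operations than relating adjacent ones.

import math
import torch

# Toy sequence: 5 positions, 8-dimensional queries/keys (illustrative sizes).
seq_len, d_k = 5, 8
q = torch.randn(seq_len, d_k)
k = torch.randn(seq_len, d_k)

# One matrix multiplication scores every pair of positions, so relating
# position 0 to position 4 costs the same as relating neighbouring positions.
scores = q @ k.transpose(0, 1) / math.sqrt(d_k)
print(scores.shape)  # torch.Size([5, 5]) -- all pairwise interactions at once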
def forward(self, src, tgt, src_mask, tgt_mask):
    "Take in and process masked src and target sequences."
    return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
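As a quick illustration (not from the original post), `clones` uses `copy.deepcopy`, so each of the N layers gets its own parameters rather than sharing weights:

# Hypothetical usage: four independent linear layers.
layers = clones(nn.Linear(16, 16), 4)
print(len(layers))  # 4
# deepcopy means the copies do not share parameter storage.
print(layers[0].weight.data_ptr() != layers[1].weight.data_ptr())  # True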
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"

    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
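`LayerNorm` is referenced above but not defined in this section. The sketch below shows the standard elementwise-affine layer normalization the code expects (an assumption here; `torch.nn.LayerNorm` would behave equivalently up to defaults):

class LayerNorm(nn.Module):
    "Layer normalization over the last (feature) dimension (sketch)."

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        # Normalize each position's feature vector, then rescale and shift.
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2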
class SubLayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """

    def __init__(self, size, dropout):
        super(SubLayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
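`make_model` below also needs `EncoderLayer`, which is not shown in this section. The following is a minimal sketch of the usual composition, self-attention plus a position-wise feed-forward block, each wrapped in a `SubLayerConnection`; treat it as an assumption consistent with the classes defined here rather than the original cell:

class EncoderLayer(nn.Module):
    "Encoder layer sketch: self-attention followed by a feed-forward sublayer."

    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SubLayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        # First sublayer: self-attention over the (masked) input.
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        # Second sublayer: position-wise feed-forward network.
        return self.sublayer[1](x, self.feed_forward)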
class MultiHeadAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [
            lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ]

        # 2) Apply attention on all the projected vectors in batch
        x, self.attn = attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # 3) "Concat" using a view and apply a final linear.
        x = (
            x.transpose(1, 2)
            .contiguous()
            .view(nbatches, -1, self.h * self.d_k)
        )
        del query
        del key
        del value
        return self.linears[-1](x)
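The `attention` helper called in step 2 is not shown in this section. The sketch below is the standard scaled dot-product attention it is expected to implement (assumed here to match the full notebook):

def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention' (sketch of the helper used above)."
    d_k = query.size(-1)
    # Similarity of every query with every key, scaled by sqrt(d_k).
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Masked positions get a large negative score so softmax zeroes them.
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    # Weighted sum of values, plus the attention weights for inspection.
    return torch.matmul(p_attn, value), p_attn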
def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )
    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
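A quick sanity check (not from the original post, and assuming the remaining components referenced by `make_model` — `EncoderDecoder`, `Decoder`, `DecoderLayer`, `Embeddings`, `Generator`, `PositionwiseFeedForward`, `PositionalEncoding` — are defined as in the full notebook) might look like this:

# Tiny vocabularies and a 2-layer model keep the check fast.
test_model = make_model(src_vocab=11, tgt_vocab=11, N=2)
test_model.eval()

src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
src_mask = torch.ones(1, 1, 10)
tgt = torch.LongTensor([[1, 2, 3]])
# Lower-triangular mask so each target position only attends to earlier ones.
tgt_mask = torch.tril(torch.ones(1, 3, 3)).type_as(src_mask)

out = test_model(src, tgt, src_mask, tgt_mask)
print(out.shape)  # (1, 3, 512): one d_model-sized decoder state per target position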