Transformer (self-attention, PyTorch) code

Implementation details:

1. Embedding layer

2. PositionalEncoding layer: adds position information to the embeddings

3. MultiHeadAttention layer: the encoder's self-attention

4. SublayerConnection layer: add & norm, implemented with LayerNorm

5. FeedForward layer: two fully connected layers

6. Masked MultiHeadAttention: the decoder's self-attention layer; a mask is added so that each position does not use information from positions after it

7. MultiHeadAttention layer: the encoder output provides the keys and values, and the decoder self-attention output provides the queries, similar to traditional attention

8. Generator layer: the final linear and softmax layers, converting the decoder output into probabilities

9. At inference time, greedy_decode is used, with the first prediction initialized to the start symbol

#!/usr/bin/env python
# coding: utf-8

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
import time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")

class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many
    other models.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        memory = self.encode(src, src_mask)
        ret = self.decode(memory, src_mask, tgt, tgt_mask)
        return ret

    def encode(self, src, src_mask):
        src_embedding = self.src_embed(src)
        ret = self.encoder(src_embedding, src_mask)
        return ret

    def decode(self, memory, src_mask, tgt, tgt_mask):
        tgt_embedding = self.tgt_embed(tgt)
        ret = self.decoder(tgt_embedding, memory, src_mask, tgt_mask)
        return ret


class Generator(nn.Module):
    "Define standard linear + softmax generation step."

    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

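# > Quick check (added for illustration, not part of the original listing): the Generator
# > maps d_model features to log-probabilities over the vocabulary, so each row sums to 1
# > after exponentiation.
gen_demo = Generator(d_model=8, vocab=5)
print(gen_demo(torch.randn(2, 8)).exp().sum(-1))  # ~1.0 per row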
# The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1, respectively.

# ## Encoder and Decoder Stacks
# ### Encoder
# The encoder is composed of a stack of $N=6$ identical layers.
def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    "Core encoder is a stack of N layers"

    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

# Layer normalization [(cite)](https://arxiv.org/abs/1607.06450), applied over the last (feature) dimension.
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

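# > Sanity check (added for illustration, not part of the original listing): after
# > normalization each position has roughly zero mean and unit std over the feature
# > dimension. Note this version adds eps to the std (not the variance) and uses the
# > unbiased std, so it is close to, but not bit-identical to, nn.LayerNorm.
ln_demo = LayerNorm(6)
x_demo = torch.randn(2, 3, 6)
print(ln_demo(x_demo).mean(-1))  # ~0 everywhere
print(ln_demo(x_demo).std(-1))   # ~1 everywhere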
# That is, the output of each sub-layer is $\mathrm{LayerNorm}(x + \mathrm{Sublayer}(x))$, where $\mathrm{Sublayer}(x)$ is the function implemented by the sub-layer itself.  We apply dropout [(cite)](http://jmlr.org/papers/v15/srivastava14a.html) to the output of each sub-layer, before it is added to the sub-layer input and normalized.
# To facilitate these residual connections, all sub-layers in the model, as well as the embedding layers, produce outputs of dimension $d_{\text{model}}=512$.
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        ret = x + self.dropout(sublayer(self.norm(x)))
        return ret

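# > Note the pre-norm form used here: out = x + Dropout(Sublayer(LayerNorm(x))). A tiny
# > check (added for illustration, not part of the original listing): a sublayer that
# > returns zeros leaves the input unchanged, since only the residual path remains.
sc_demo = SublayerConnection(16, dropout=0.0)
x_demo = torch.randn(4, 16)
print(torch.allclose(sc_demo(x_demo, lambda t: t * 0), x_demo))  # True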

# Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network.
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"

    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        # torch.Size([30, 10, 512])
        ret = self.sublayer[1](x, self.feed_forward)
        return ret

# ### Decoder
# The decoder is also composed of a stack of $N=6$ identical layers.
class Decoder(nn.Module):
    "Generic N layer decoder with masking."

    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)


# In addition to the two sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack.  Similar to the encoder, we employ residual connections around each of the sub-layers, followed by layer normalization.
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

# ### Attention
# An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors.  The output is computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.
# We call our particular attention "Scaled Dot-Product Attention".   The input consists of queries and keys of dimension $d_k$, and values of dimension $d_v$.  We compute the dot products of the query with all keys, divide each by $\sqrt{d_k}$, and apply a softmax function to obtain the weights on the values.
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    # query, key, value: torch.Size([30, 8, 10, 64])
    # decoder mask: torch.Size([30, 1, 9, 9])
    d_k = query.size(-1)
    key_ = key.transpose(-2, -1)  # torch.Size([30, 8, 64, 10])
    # torch.Size([30, 8, 10, 10])
    scores = torch.matmul(query, key_) / math.sqrt(d_k)
    if mask is not None:
        # decoder scores: torch.Size([30, 8, 9, 9])
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

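# > In equation form: $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}(QK^T / \sqrt{d_k})V$.
# > A small shape check (added for illustration, not part of the original listing); the
# > sizes are arbitrary:
q_demo = torch.randn(2, 8, 10, 64)  # (batch, heads, seq_len, d_k)
out_demo, attn_demo = attention(q_demo, q_demo, q_demo)
print(out_demo.shape, attn_demo.shape)  # torch.Size([2, 8, 10, 64]) torch.Size([2, 8, 10, 10])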

class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h  # 64 = 512 // 8
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        # query, key, value: torch.Size([30, 10, 512])
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linears, (query, key, value))]  # torch.Size([30, 8, 10, 64])
        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, mask=mask,
                                 dropout=self.dropout)
        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(
            nbatches, -1, self.h * self.d_k)
        ret = self.linears[-1](x)  # torch.Size([30, 10, 512])
        return ret

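# > Illustrative check (not part of the original listing): multi-head attention preserves
# > the (batch, seq_len, d_model) shape; the split into h heads of size d_k is internal.
mha_demo = MultiHeadedAttention(h=8, d_model=512)
x_demo = torch.randn(30, 10, 512)
print(mha_demo(x_demo, x_demo, x_demo).shape)  # torch.Size([30, 10, 512])
print(mha_demo.attn.shape)                     # torch.Size([30, 8, 10, 10]) per-head weights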
# ### Applications of Attention in our Model
# The Transformer uses multi-head attention in three different ways:
# 1) In "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder.   This allows every position in the decoder to attend over all positions in the input sequence.  This mimics the typical encoder-decoder attention mechanisms in sequence-to-sequence models such as [(cite)](https://arxiv.org/abs/1609.08144).
# 2) The encoder contains self-attention layers.  In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder.   Each position in the encoder can attend to all positions in the previous layer of the encoder.
# 3) Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position.  We need to prevent leftward information flow in the decoder to preserve the auto-regressive property.  We implement this inside of scaled dot-product attention by masking out (setting to $-\infty$) all values in the input of the softmax which correspond to illegal connections.
# ## Position-wise Feed-Forward Networks
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

# ## Embeddings and Softmax
# Similarly to other sequence transduction models, we use learned embeddings to convert the input tokens and output tokens to vectors of dimension $d_{\text{model}}$.  We also use the usual learned linear transformation and softmax function to convert the decoder output to predicted next-token probabilities.  In our model, we share the same weight matrix between the two embedding layers and the pre-softmax linear transformation, similar to [(cite)](https://arxiv.org/abs/1608.05859). In the embedding layers, we multiply those weights by $\sqrt{d_{\text{model}}}$.
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)  # Embedding(11, 512)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

# ## Positional Encoding
class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0., max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2)
                             * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)],
                         requires_grad=False)
        return self.dropout(x)

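# > A quick way to visualize the encodings (added for illustration, not part of the
# > original listing; flip the flag to True to see the plot): each dimension is a sinusoid
# > of a different frequency, so every position receives a distinct pattern.
if False:
    pe_demo = PositionalEncoding(20, 0)
    y_demo = pe_demo(Variable(torch.zeros(1, 100, 20)))
    plt.figure(figsize=(15, 5))
    plt.plot(np.arange(100), y_demo[0, :, 4:8].data.numpy())
    plt.legend(["dim %d" % p for p in [4, 5, 6, 7]])
    plt.show()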
# We also experimented with using learned positional embeddings [(cite)](https://arxiv.org/pdf/1705.03122.pdf) instead, and found that the two versions produced nearly identical results.  We chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training.
# ## Full Model
def make_model(src_vocab, tgt_vocab, N=6,
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn),
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

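# > Small sanity check (added for illustration, not part of the original listing): build a
# > tiny 2-layer model and count its parameters.
tmp_model = make_model(10, 10, N=2)
print(sum(p.numel() for p in tmp_model.parameters()), "parameters")
del tmp_model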
# We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions.  This masking, combined with the fact that the output embeddings are offset by one position, ensures that the predictions for position $i$ can depend only on the known outputs at positions less than $i$.


def subsequent_mask(size):
    "Mask out subsequent positions when decoding."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

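# > Illustration (not part of the original listing): position i may attend only to
# > positions <= i, so the mask is lower-triangular.
print(subsequent_mask(4)[0])  # row i is True up to and including column i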
# # Training
# This section describes the training regime for our models.
# > We stop for a quick interlude to introduce some of the tools
# needed to train a standard encoder decoder model. First we define a batch object that holds the src and target sentences for training, as well as constructing the masks.
# ## Batches and Masking


class Batch:
    "Object for holding a batch of data with mask during training."

    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
        return tgt_mask

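# > Illustration (not part of the original listing): with pad index 0, src_mask hides
# > padding, while trg_mask also hides future positions (trg drops the last token and
# > trg_y drops the first, giving the usual teacher-forcing shift).
batch_demo = Batch(torch.LongTensor([[1, 2, 3, 0]]), torch.LongTensor([[1, 4, 5, 0]]), pad=0)
print(batch_demo.src_mask.shape, batch_demo.trg_mask.shape)  # torch.Size([1, 1, 4]) torch.Size([1, 3, 3])
print(batch_demo.ntokens)  # 2 non-pad target tokens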
# Next we create a generic training and scoring function to keep track of loss. We pass in a generic loss compute function that also handles parameter updates.
def run_epoch(data_iter, model, loss_compute):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    for i, batch in enumerate(data_iter):
        # src: torch.Size([30, 10]), trg: torch.Size([30, 9]),
        # src_mask: torch.Size([30, 1, 10]), trg_mask: torch.Size([30, 9, 9])
        out = model.forward(batch.src, batch.trg,
                            batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Step: %d Loss: %f Tokens per Sec: %f" %
                  (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0

    return total_loss / total_tokens

# ## Optimizer
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * (self.model_size ** (-0.5) *
                              min(step ** (-0.5), step * self.warmup ** (-1.5)))


def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
                   torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))


# Three settings of the lrate hyperparameters.
opts = [NoamOpt(512, 1, 4000, None),
        NoamOpt(512, 1, 8000, None),
        NoamOpt(256, 1, 4000, None)]

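# > The schedule implemented by `rate` is
# > $lrate = factor \cdot d_{\text{model}}^{-0.5} \cdot \min(step^{-0.5},\ step \cdot warmup^{-1.5})$,
# > i.e. a linear warmup for `warmup` steps followed by inverse-square-root decay. Flip the
# > flag to True to plot the three settings above (added for illustration, not part of the
# > original listing):
if False:
    plt.plot(np.arange(1, 20000),
             [[opt.rate(i) for opt in opts] for i in range(1, 20000)])
    plt.legend(["512:4000", "512:8000", "256:4000"])
    plt.show()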
# ## Regularization
# ### Label Smoothing
# During training, we employed label smoothing. This hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))

# > Here we can see an example of how the mass is distributed to the words based on confidence.
# crit = LabelSmoothing(5, 0, 0.4)
# predict = torch.FloatTensor([[0, 0.2, 0.7, 0.1, 0],
#                              [0, 0.2, 0.7, 0.1, 0],
#                              [0, 0.2, 0.7, 0.1, 0]])
# v = crit(Variable(predict.log()),
#          Variable(torch.LongTensor([2, 1, 0])))


# crit = LabelSmoothing(5, 0, 0.1)
# def loss(x):
#     d = x + 3 * 1
#     predict = torch.FloatTensor([[0, x / d, 1 / d, 1 / d, 1 / d],
#                                  ])
#     # print(predict)
#     return crit(Variable(predict.log()),
#                 Variable(torch.LongTensor([1]))).item()

# # A First Example
# > We can begin by trying out a simple copy-task. Given a random set of input symbols from a small vocabulary, the goal is to generate back those same symbols.
# ## Synthetic Data
def data_gen(V, batch, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))  # torch.Size([30, 10])
        data[:, 0] = 1  # start symbol
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield Batch(src, tgt, 0)
# data_gen(11, 30, 20)

458 
459 # ## Loss Computation
460 class SimpleLossCompute:
461     "A simple loss compute and train function."
462     def __init__(self, generator, criterion, opt=None):
463         self.generator = generator
464         self.criterion = criterion
465         self.opt = opt
466         
467     def __call__(self, x, y, norm):
468         x = self.generator(x)
469         loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
470                               y.contiguous().view(-1)) / norm
471         loss.backward()
472         if self.opt is not None:
473             self.opt.step()
474             self.opt.optimizer.zero_grad()
475         return loss.item() * norm
476 
477 
# ## Greedy Decoding
# Train the simple copy task.
V = 11
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2)
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
                    torch.optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.98), eps=1e-9))

for epoch in range(5):
    model.train()
    run_epoch(data_gen(V, 30, 20), model,
              SimpleLossCompute(model.generator, criterion, model_opt))
    model.eval()
    print(run_epoch(data_gen(V, 30, 5), model,
                    SimpleLossCompute(model.generator, criterion, None)))

# This code predicts a translation using greedy decoding for simplicity.
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)  # fill start symbol
    for i in range(max_len - 1):
        out = model.decode(memory, src_mask,
                           Variable(ys),
                           Variable(subsequent_mask(ys.size(1))
                                    .type_as(src.data)))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys


model.eval()
src = Variable(torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]))
src_mask = Variable(torch.ones(1, 1, 10))
print(greedy_decode(model, src, src_mask, max_len=10, start_symbol=1))

'''
# # A Real World Example
#
# > Now we consider a real-world example using the IWSLT German-English Translation task. This task is much smaller than the WMT task considered in the paper, but it illustrates the whole system. We also show how to use multi-gpu processing to make it really fast.

#!pip install torchtext spacy
#!python -m spacy download en
#!python -m spacy download de


# ## Training Data and Batching
global max_src_in_batch, max_tgt_in_batch
def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch

    return max(src_elements, tgt_elements)

# ## Data Loading
# > We will load the dataset using torchtext and spacy for tokenization.

# For data loading.
from torchtext import data, datasets

if True:
    import spacy
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                     eos_token=EOS_WORD, pad_token=BLANK_WORD)

    MAX_LEN = 100
    train, val, test = datasets.IWSLT.splits(
        exts=('.de', '.en'), fields=(SRC, TGT),
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and
            len(vars(x)['trg']) <= MAX_LEN)
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)


# > Batching matters a ton for speed. We want to have very evenly divided batches, with absolutely minimal padding. To do this we have to hack a bit around the default torchtext batching. This code patches their default batching to make sure we search over enough sentences to find tight batches.
# ## Iterators

class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)

        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))

def rebatch(pad_idx, batch):
    "Fix order in torchtext to match ours"
    src, trg = batch.src.transpose(0, 1), batch.trg.transpose(0, 1)
    return Batch(src, trg, pad_idx)


# ## Multi-GPU Training
# > Finally to really target fast training, we will use multi-gpu. This code implements multi-gpu word generation. It is not specific to transformer so I won't go into too much detail. The idea is to split up word generation at training time into chunks to be processed in parallel across many different gpus. We do this using pytorch parallel primitives:
#
# * replicate - split modules onto different gpus.
# * scatter - split batches onto different gpus
# * parallel_apply - apply module to batches on different gpus
# * gather - pull scattered data back onto one gpu.
# * nn.DataParallel - a special module wrapper that calls these all before evaluating.
#

# Skip if not interested in multigpu.
class MultiGPULossCompute:
    "A multi-gpu loss compute and train function."
    def __init__(self, generator, criterion, devices, opt=None, chunk_size=5):
        # Send out to different gpus.
        self.generator = generator
        self.criterion = nn.parallel.replicate(criterion,
                                               devices=devices)
        self.opt = opt
        self.devices = devices
        self.chunk_size = chunk_size

    def __call__(self, out, targets, normalize):
        total = 0.0
        generator = nn.parallel.replicate(self.generator,
                                          devices=self.devices)
        out_scatter = nn.parallel.scatter(out,
                                          target_gpus=self.devices)
        out_grad = [[] for _ in out_scatter]
        targets = nn.parallel.scatter(targets,
                                      target_gpus=self.devices)

        # Divide generating into chunks.
        chunk_size = self.chunk_size
        for i in range(0, out_scatter[0].size(1), chunk_size):
            # Predict distributions
            out_column = [[Variable(o[:, i:i+chunk_size].data,
                                    requires_grad=self.opt is not None)]
                          for o in out_scatter]
            gen = nn.parallel.parallel_apply(generator, out_column)

            # Compute loss.
            y = [(g.contiguous().view(-1, g.size(-1)),
                  t[:, i:i+chunk_size].contiguous().view(-1))
                 for g, t in zip(gen, targets)]
            loss = nn.parallel.parallel_apply(self.criterion, y)

            # Sum and normalize loss
            l = nn.parallel.gather(loss,
                                   target_device=self.devices[0])
            l = l.sum()[0] / normalize
            total += l.data[0]

            # Backprop loss to output of transformer
            if self.opt is not None:
                l.backward()
                for j, l in enumerate(loss):
                    out_grad[j].append(out_column[j][0].grad.data.clone())

        # Backprop all loss through transformer.
        if self.opt is not None:
            out_grad = [Variable(torch.cat(og, dim=1)) for og in out_grad]
            o1 = out
            o2 = nn.parallel.gather(out_grad,
                                    target_device=self.devices[0])
            o1.backward(gradient=o2)
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return total * normalize


# > Now we create our model, criterion, optimizer, data iterators, and parallelization
# GPUs to use
devices = [0, 1, 2, 3]
if True:
    pad_idx = TGT.vocab.stoi["<blank>"]
    model = make_model(len(SRC.vocab), len(TGT.vocab), N=6)
    model.cuda()
    criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    criterion.cuda()
    BATCH_SIZE = 12000
    train_iter = MyIterator(train, batch_size=BATCH_SIZE, device=0,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(val, batch_size=BATCH_SIZE, device=0,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)
    model_par = nn.DataParallel(model, device_ids=devices)
None


# > Now we train the model. I will play with the warmup steps a bit, but everything else uses the default parameters.  On an AWS p3.8xlarge with 4 Tesla V100s, this runs at ~27,000 tokens per second with a batch size of 12,000.
# ## Training the System
#!wget https://s3.amazonaws.com/opennmt-models/iwslt.pt

if False:
    model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
                        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    for epoch in range(10):
        model_par.train()
        run_epoch((rebatch(pad_idx, b) for b in train_iter),
                  model_par,
                  MultiGPULossCompute(model.generator, criterion,
                                      devices=devices, opt=model_opt))
        model_par.eval()
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
                         model_par,
                         MultiGPULossCompute(model.generator, criterion,
                                             devices=devices, opt=None))
        print(loss)
else:
    model = torch.load("iwslt.pt")


# > Once trained we can decode the model to produce a set of translations. Here we simply translate the first sentence in the validation set. This dataset is pretty small so the translations with greedy search are reasonably accurate.

for i, batch in enumerate(valid_iter):
    src = batch.src.transpose(0, 1)[:1]
    src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
    out = greedy_decode(model, src, src_mask,
                        max_len=60, start_symbol=TGT.vocab.stoi["<s>"])
    print("Translation:", end="\t")
    for i in range(1, out.size(1)):
        sym = TGT.vocab.itos[out[0, i]]
        if sym == "</s>": break
        print(sym, end=" ")
    print()
    print("Target:", end="\t")
    for i in range(1, batch.trg.size(0)):
        sym = TGT.vocab.itos[batch.trg.data[i, 0]]
        if sym == "</s>": break
        print(sym, end=" ")
    print()
    break


# # Additional Components: BPE, Search, Averaging

# > So this mostly covers the transformer model itself. There are four aspects that we didn't cover explicitly. We also have all these additional features implemented in [OpenNMT-py](https://github.com/opennmt/opennmt-py).
#

# > 1) BPE/ Word-piece: We can use a library to first preprocess the data into subword units. See Rico Sennrich's [subword-nmt](https://github.com/rsennrich/subword-nmt) implementation. These models will transform the training data to look like this:
# ▁Die ▁Protokoll datei ▁kann ▁ heimlich ▁per ▁E - Mail ▁oder ▁FTP ▁an ▁einen ▁bestimmte n ▁Empfänger ▁gesendet ▁werden .
# > 2) Shared Embeddings: When using BPE with shared vocabulary we can share the same weight vectors between the source / target / generator. See the [(cite)](https://arxiv.org/abs/1608.05859) for details. To add this to the model simply do this:

if False:
    model.src_embed[0].lut.weight = model.tgt_embed[0].lut.weight
    model.generator.proj.weight = model.tgt_embed[0].lut.weight


# > 3) Beam Search: This is a bit too complicated to cover here. See the [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/translate/Beam.py) for a pytorch implementation.
# > 4) Model Averaging: The paper averages the last k checkpoints to create an ensembling effect. We can do this after the fact if we have a bunch of models:

def average(model, models):
    "Average models into model"
    for ps in zip(*[m.parameters() for m in [model] + models]):
        ps[0].data.copy_(sum(ps[1:]) / len(ps[1:]))


# # Results
#
# On the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big)
# in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0
# BLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is
# listed in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. Even our base model
# surpasses all previously published models and ensembles, at a fraction of the training cost of any of
# the competitive models.
#
# On the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0,
# outperforming all of the previously published single models, at less than 1/4 the training cost of the
# previous state-of-the-art model. The Transformer (big) model trained for English-to-French used
# dropout rate Pdrop = 0.1, instead of 0.3.


# > The code we have written here is a version of the base model. There are fully trained versions of this system available here [(Example Models)](http://opennmt.net/Models-py/).
# >
# > With the additional extensions in the last section, the OpenNMT-py replication gets to 26.9 on EN-DE WMT. Here I have loaded in those parameters to our reimplementation.

#!wget https://s3.amazonaws.com/opennmt-models/en-de-model.pt

model, SRC, TGT = torch.load("en-de-model.pt")

model.eval()
sent = "▁The ▁log ▁file ▁can ▁be ▁sent ▁secret ly ▁with ▁email ▁or ▁FTP ▁to ▁a ▁specified ▁receiver".split()
src = torch.LongTensor([[SRC.stoi[w] for w in sent]])
src = Variable(src)
src_mask = (src != SRC.stoi["<blank>"]).unsqueeze(-2)
out = greedy_decode(model, src, src_mask,
                    max_len=60, start_symbol=TGT.stoi["<s>"])
print("Translation:", end="\t")
trans = "<s> "
for i in range(1, out.size(1)):
    sym = TGT.itos[out[0, i]]
    if sym == "</s>": break
    trans += sym + " "
print(trans)


# ## Attention Visualization
#
# > Even with a greedy decoder the translation looks pretty good. We can further visualize it to see what is happening at each layer of the attention.

tgt_sent = trans.split()
def draw(data, x, y, ax):
    seaborn.heatmap(data,
                    xticklabels=x, square=True, yticklabels=y, vmin=0.0, vmax=1.0,
                    cbar=False, ax=ax)

for layer in range(1, 6, 2):
    fig, axs = plt.subplots(1, 4, figsize=(20, 10))
    print("Encoder Layer", layer + 1)
    for h in range(4):
        draw(model.encoder.layers[layer].self_attn.attn[0, h].data,
             sent, sent if h == 0 else [], ax=axs[h])
    plt.show()

for layer in range(1, 6, 2):
    fig, axs = plt.subplots(1, 4, figsize=(20, 10))
    print("Decoder Self Layer", layer + 1)
    for h in range(4):
        draw(model.decoder.layers[layer].self_attn.attn[0, h].data[:len(tgt_sent), :len(tgt_sent)],
             tgt_sent, tgt_sent if h == 0 else [], ax=axs[h])
    plt.show()
    print("Decoder Src Layer", layer + 1)
    fig, axs = plt.subplots(1, 4, figsize=(20, 10))
    for h in range(4):
        # Use src_attn (encoder-decoder attention) here, not self_attn.
        draw(model.decoder.layers[layer].src_attn.attn[0, h].data[:len(tgt_sent), :len(sent)],
             sent, tgt_sent if h == 0 else [], ax=axs[h])
    plt.show()

836 '''
Original post: https://www.cnblogs.com/buyizhiyou/p/11649817.html