PyTorch seq2seq machine translation model with attention

Study notes

Corpus link: https://pan.baidu.com/s/1wpP4t_GSyPAD6HTsIoGPZg
Extraction code: jqq8

Data format: each line contains an English sentence and its Chinese translation, separated by a tab.

Imports:

import os
import sys
import math
from collections import Counter
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

import nltk
nltk.download('punkt')

# assumption: `device` is used throughout the code below but is not defined in these notes;
# default to the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

1. Data preprocessing

1.1 Read the English and Chinese data

  • English is tokenized with nltk's word_tokenize and lower-cased

  • Chinese is split into individual characters as the basic unit (see the small example after this list)
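
A minimal illustration of the two tokenization schemes on a toy sentence (not taken from the corpus):

print(['BOS'] + nltk.word_tokenize("Hello, world!".lower()) + ['EOS'])
# ['BOS', 'hello', ',', 'world', '!', 'EOS']
print(['BOS'] + [c for c in '你好世界'] + ['EOS'])
# ['BOS', '你', '好', '世', '界', 'EOS']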

def load_data(in_file):
    cn = []
    en = []
    num_examples = 0
    with open(in_file, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip().split('\t')
            
            en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])
            cn.append(['BOS'] + [c for c in line[1]] + ['EOS'])
    
    return en, cn

train_file = 'nmt/en-cn/train.txt'
dev_file = 'nmt/en-cn/dev.txt'
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

Inspect the returned data:

print(dev_en[:2])
print(dev_cn[:2])

[['BOS', 'she', 'put', 'the', 'magazine', 'on', 'the', 'table', '.', 'EOS'], ['BOS', 'hey', ',', 'what', 'are', 'you', 'doing', 'here', '?', 'EOS']]
[['BOS', '她', '把', '雜', '誌', '放', '在', '桌', '上', '。', 'EOS'], ['BOS', '嘿', ',', '你', '在', '這', '做', '什', '麼', '?', 'EOS']]

1.2 Build the vocabulary

UNK_IDX = 0
PAD_IDX = 1
def build_dict(sentences, max_words = 50000):
    word_count = Counter()
    for sentence in sentences:
        for word in sentence:
            word_count[word] += 1
    
    ls = word_count.most_common(max_words)   # the max_words most frequent words (descending)
    total_words = len(ls) + 2
    
    word_dict = {w[0] : index + 2 for index, w in enumerate(ls)}  # {word: index}; w[0] is the word, w[1] its count
    word_dict['UNK'] = UNK_IDX
    word_dict['PAD'] = PAD_IDX
    
    return word_dict, total_words           # total_words: vocabulary size, at most 50002

en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
inv_en_dict = {v: k for k, v in en_dict.items()}  # English: {index: word}
inv_cn_dict = {v: k for k, v in cn_dict.items()}  # Chinese: {index: character}

1.3 Convert all words to indices

sort_by_len=True: sorting by length keeps the sentences within each batch roughly the same length.

def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    length = len(en_sentences)
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]

    # sort sentences by length
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    # sort the Chinese and English sentences in the same order
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
    
    return out_en_sentences, out_cn_sentences

train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)  # [[2, 168, 201, 4, 3], [], ...., [2, 5, 14, 13, 22, 9, 149, 17, 107, 24, 121, 16, 20, 267, 7, 181, 23, 15, 6, 422, 25, 220, 4, 3]]

Inspect the returned data:

print(train_cn[2])
print([inv_cn_dict[i] for i in train_cn[2]])
print([inv_en_dict[i] for i in train_en[2]])

[2, 982, 2028, 8, 4, 3]
['BOS', '祝', '贺', '你', '。', 'EOS']
['BOS', 'congratulations', '!', 'EOS']

1.4 Split the sentences into batches

# returns a list of minibatches, each holding the sentence indices of that batch, e.g. [[11, 4, 3, 5], [16, 7, 5, 7], ...]
def get_minibatches(n, minibatch_size, shuffle=True):  # n is the number of sentences
    idx_list = np.arange(0, n, minibatch_size)         # batch start indices: [0, minibatch_size, 2*minibatch_size, ...]
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches

See what this function does:

get_minibatches(100, 15)

[array([90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
array([60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]),
array([15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
array([30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]),
array([75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]),
array([45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])]

# seqs is one minibatch from get_minibatches: the encoded sentences for batch_size indices (a nested list); here batch_size=64
def prepare_data(seqs):   
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)                  # number of sentences
    max_len = np.max(lengths)              # length of the longest sentence in the batch

    x = np.zeros((n_samples, max_len)).astype('int32')
    x_lengths = np.array(lengths).astype('int32') # original (unpadded) sentence lengths in the batch

    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq   # copy each sentence; shorter sentences are padded with 0
    
    return x, x_lengths

def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)    
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]  # encoded English sentences of this batch, e.g. [[2, 982, 8], [14, 5, 6], ...]
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_len = prepare_data(mb_en_sentences)          # padded English batch and the original sentence lengths
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    # returns n / batch_size tuples of (English batch, English lengths, Chinese batch, Chinese lengths)
    return all_ex   

batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
dev_data = gen_examples(dev_en, dev_cn, batch_size)
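
Each entry of train_data is one batch. A quick shape check (a small addition, not in the original notes):

mb_x, mb_x_len, mb_y, mb_y_len = train_data[0]
print(mb_x.shape, mb_x_len.shape, mb_y.shape, mb_y_len.shape)
# (batch_size, max English length) (batch_size,) (batch_size, max Chinese length) (batch_size,)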

2. Encoder-Decoder model (without attention)

2.1 Define the loss function

# masked cross entropy loss
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        # input: [64, 12, 3195]  target: [64, 12]  mask: [64, 12]
        # reshape input to (batch_size * seq_len) * vocab_size
        input = input.contiguous().view(-1, input.size(2))
        # reshape target and mask to (batch_size * seq_len) * 1
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        output = -input.gather(1, target) * mask  # pick the log-probability of each target word along dim 1
        # this is the cross-entropy loss; F.log_softmax was already applied in the decoder
        # output.shape = torch.Size([768, 1])
        # the mask zeroes out the padded positions: index 0 is also a real token in the vocab,
        # but the trailing 0s in target are padding, not words, so they must not contribute to the loss
        output = torch.sum(output) / torch.sum(mask)
        # per-word average loss; the minus sign was applied above, so this is minimized as usual
        return output
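
Since the decoder output has already gone through F.log_softmax, this criterion is just a masked NLL loss averaged over the non-padded positions. A small sanity check against F.nll_loss on random tensors (a sketch, not in the original notes):

logits = F.log_softmax(torch.randn(2, 5, 7), dim=-1)                 # [batch, seq_len, vocab]
target = torch.randint(0, 7, (2, 5))
mask = (torch.arange(5)[None, :] < torch.tensor([5, 3])[:, None]).float()

loss1 = LanguageModelCriterion()(logits, target, mask)
nll = F.nll_loss(logits.view(-1, 7), target.view(-1), reduction='none')
loss2 = (nll * mask.view(-1)).sum() / mask.sum()
print(torch.allclose(loss1, loss2))                                  # True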

2.2 Encoder

The encoder passes the input words through an embedding layer and a GRU, turning them into hidden states that serve as the context vectors later on;

On understanding nn.utils.rnn.pack_padded_sequence and nn.utils.rnn.pad_packed_sequence, see: http://www.mamicode.com/info-detail-2493083.html
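
A minimal pack/pad round trip on toy tensors (a sketch; the numbers are made up), which also shows why the encoder sorts the batch by length before packing:

seqs = torch.tensor([[1, 2, 3], [4, 5, 0]])     # the second sentence is padded with 0
lengths = torch.tensor([3, 2])                  # must be in descending order here
emb = nn.Embedding(10, 4)
packed = nn.utils.rnn.pack_padded_sequence(emb(seqs), lengths, batch_first=True)
out, hid = nn.GRU(4, 6, batch_first=True)(packed)
unpacked, unpacked_len = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
print(unpacked.shape, hid.shape)                # torch.Size([2, 3, 6]) torch.Size([1, 2, 6])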

class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):   # assume embedding_size = hidden_size
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True) # batch_first=True: [batch_size, seq_len, hidden_size]
        self.dropout = nn.Dropout(dropout)
        
    # x: one batch of encoded sentences
    # lengths: the original (unpadded) length of each sentence
    # the last hidden state will be used as the context vector, which is why lengths are needed
    def forward(self, x, lengths):    
        # (sorted values, indices of the sorted values)
        sorted_len, sorted_idx = lengths.sort(0, descending=True)  # sort the sequences in the batch by length, descending 
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        
        # the sentences are padded to the same length (the real sentences are shorter);
        # pack_padded_sequence lets the RNN stop at each sentence's true last step
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(),
                                                            batch_first=True)   
        # out: [batch, seq_len, hidden_size]
        # hidden: [num_layers=1, batch, hidden_size]
        packed_out, hidden = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # back to the padded length
        
        _, original_idx = sorted_idx.sort(0, descending=False)                  # restore the original order
        
        out = out[original_idx.long()].contiguous()            # [batch_size, seq_len, hidden_size]
        hidden = hidden[:, original_idx.long()].contiguous()   # [num_layers, batch_size, hidden_size]
        
#         print("out.shape: ", out.shape, 'hidden.shape: ', hidden.shape)
        
        return out, hidden[[-1]]  # hidden[[-1]] keeps the last layer, comparable to out[:, -1]

A quick test (can be commented out):

# check the shapes
p = PlainEncoder(en_total_words, 100)

mb_x = torch.from_numpy(train_data[0][0]).long()
mb_x_len = torch.from_numpy(train_data[0][1]).long()
print("Dataset:", mb_x.shape, mb_x_len.shape)

o, h = p(mb_x, mb_x_len)

print(o.shape, h.shape)
print(o[:, -1].shape, '\n', o[:, -1] == h)

Dataset: torch.Size([64, 14]) torch.Size([64])
out.shape: torch.Size([64, 14, 100]) hidden.shape: torch.Size([1, 64, 100])
torch.Size([64, 14, 100]) torch.Size([1, 64, 100])
torch.Size([64, 100])
tensor([[[True, True, True, ..., True, True, True],
[True, True, True, ..., True, True, True],
[True, True, True, ..., True, True, True],
...,
[True, True, True, ..., True, True, True],
[True, True, True, ..., True, True, True],
[True, True, True, ..., True, True, True]]])

2.3 Decoder

The decoder decides the next output word based on what has been translated so far and the context vectors;

class PlainDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)  # [batch_size, seq_len, hidden_size]
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    # the forward pass is much like PlainEncoder's; the difference is that the initial
    # hidden state is not zero but is passed in
    # y: one batch of encoded Chinese sentences
    # hid: hidden state, the context vectors
    def forward(self, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]
        
        # [batch_size, y_lengths, embed_size=hidden_size]
        y_sorted = self.dropout(self.embed(y_sorted))
        
        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(),
                                                   batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        
        _, original_idx = sorted_idx.sort(0, descending=False)  # restore the original order
        output_seq = unpacked[original_idx.long()].contiguous() # [batch_size, y_length, hidden_size]
        hid = hid[:, original_idx.long()].contiguous()          # [1, batch_size, hidden_size]
        
        output = F.log_softmax(self.fc(output_seq), -1)         # [batch_size, y_lengths, vocab_size]
        
        return output, hid
        

2.4 Build the Seq2Seq model

The Seq2Seq module ties the encoder and decoder together (no attention yet in this version);

class PlainSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(PlainSeq2Seq, self).__init__()
        self.encoder = encoder 
        self.decoder = decoder
        
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid = self.decoder(y, y_lengths, hid)
            
        return output, None
    
    
    def translate(self, x, x_lengths, y, max_length=10):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        # greedy decoding, one word at a time
        for i in range(max_length):
            # output: [batch_size, y_lengths, vocab_size]
            # during training y is a whole sentence and is decoded in one pass;
            # at test time y is generated word by word, so the y passed in here is a
            # single word (initially BOS) and y_lengths is likewise 1
            output, hid = self.decoder(y=y, y_lengths=torch.ones(batch_size).long().to(device), 
                                       hid=hid)
            # BOS is the first input; afterwards y is updated so that each step's input
            # is the previous step's predicted word
            # output.shape = torch.Size([1, 1, 3195])
            # hid.shape = torch.Size([1, 1, 100])

            y = output.max(2)[1].view(batch_size, 1) 
            # .max(2) takes the max over the third dimension and returns (values, indices); [1] keeps the indices
            preds.append(y)
            # preds = [tensor([[5]], device='cuda:0'), tensor([[24]], device='cuda:0'), ... tensor([[4]], device='cuda:0')]
        
        # torch.cat(preds, 1) = tensor([[ 5, 24,  6, 22,  7,  4,  3,  4,  3,  4]], device='cuda:0')
        return torch.cat(preds, 1), None
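
A tiny illustration of the output.max(2)[1] step used in translate (toy tensor, not from the original notes):

o = torch.tensor([[[0.1, 0.7, 0.2]]])   # [batch=1, len=1, vocab=3]
print(o.max(2)[1])                      # tensor([[1]]) -> index of the most likely word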

3. Create the model

Define the model, the loss, and the optimizer.

dropout = 0.2
hidden_size = 100
encoder = PlainEncoder(vocab_size=en_total_words, hidden_size=hidden_size, dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words, hidden_size=hidden_size, dropout=dropout)

model = PlainSeq2Seq(encoder, decoder)
model = model.to(device)

loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

4. Train the model

def train(model, data, num_epochs=20):
    for epoch in range(num_epochs):
        model.train()      # training mode
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()  # decoder input: everything before the final token (EOS)
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()  # decoder target: everything after BOS
            
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1
            
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            
            # [mb_y_len.max()]->[1, mb_y_len.max()]
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()
            
            # loss_fn(prediction, target, mask)
            # mb_output holds the indices of the target words
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
            
            # update the model
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()
            
            if it % 100 == 0:
                print("Epoch: ", epoch, 'iteration', it, 'loss:', loss.item())
            
        
        print("Epoch", epoch, "Training loss", total_loss / total_num_words)
        
        if epoch % 5 == 0:
            evaluate(model, dev_data)
    
    torch.save(model.state_dict(), 'translate_model.pt')
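
The input/target shift above is teacher forcing: the decoder is fed the ground-truth previous word and trained to predict the next one. A toy illustration of the shift (not from the original notes):

sent = ['BOS', '我', '爱', '你', 'EOS']
print(sent[:-1])   # ['BOS', '我', '爱', '你']   -> decoder input  (mb_input)
print(sent[1:])    # ['我', '爱', '你', 'EOS']   -> decoder target (mb_output)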

5. Evaluate the model

def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    
    with torch.no_grad():
        
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()
            mb_y_len[mb_y_len<=0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
            
    print("Evaluation loss", total_loss / total_num_words)

Train for 100 epochs:

train(model, train_data, num_epochs=100)

Training output (the training loss keeps decreasing):

Epoch:  0 iteration 0 loss: 3.3647029399871826
Epoch:  0 iteration 100 loss: 3.009509563446045
Epoch:  0 iteration 200 loss: 3.782735824584961
Epoch 0 Training loss 3.1535905243275186
Evaluation loss 3.3979927680761692
Epoch:  1 iteration 0 loss: 3.3019187450408936
Epoch:  1 iteration 100 loss: 2.9146101474761963
Epoch:  1 iteration 200 loss: 3.7248971462249756
Epoch 1 Training loss 3.0795154243968996
Epoch:  2 iteration 0 loss: 3.204010009765625
Epoch:  2 iteration 100 loss: 2.863368511199951
Epoch:  2 iteration 200 loss: 3.6527459621429443
Epoch 2 Training loss 3.0103434118084182
Epoch:  3 iteration 0 loss: 3.146893262863159
Epoch:  3 iteration 100 loss: 2.759276866912842
Epoch:  3 iteration 200 loss: 3.589343309402466
Epoch 3 Training loss 2.9467000284848877
Epoch:  4 iteration 0 loss: 3.1050117015838623
Epoch:  4 iteration 100 loss: 2.708840847015381
Epoch:  4 iteration 200 loss: 3.5071861743927
Epoch 4 Training loss 2.8919197189025825
Epoch:  5 iteration 0 loss: 3.0071966648101807
Epoch:  5 iteration 100 loss: 2.6622238159179688
Epoch:  5 iteration 200 loss: 3.464808225631714
Epoch 5 Training loss 2.832557945455863
Evaluation loss 3.2545772727449775
Epoch:  6 iteration 0 loss: 2.967473268508911
Epoch:  6 iteration 100 loss: 2.586355209350586
Epoch:  6 iteration 200 loss: 3.467402696609497
Epoch 6 Training loss 2.7854216275948933
Epoch:  7 iteration 0 loss: 2.922556161880493
Epoch:  7 iteration 100 loss: 2.5442593097686768
Epoch:  7 iteration 200 loss: 3.402819871902466
Epoch 7 Training loss 2.7393553376979582
Epoch:  8 iteration 0 loss: 2.8680827617645264
Epoch:  8 iteration 100 loss: 2.4990341663360596
Epoch:  8 iteration 200 loss: 3.363720178604126
Epoch 8 Training loss 2.6976078317344734
Epoch:  9 iteration 0 loss: 2.7911880016326904
Epoch:  9 iteration 100 loss: 2.4367892742156982
Epoch:  9 iteration 200 loss: 3.3128461837768555
Epoch 9 Training loss 2.655838535325863
Epoch:  10 iteration 0 loss: 2.760638475418091
Epoch:  10 iteration 100 loss: 2.388662338256836
Epoch:  10 iteration 200 loss: 3.299316167831421
Epoch 10 Training loss 2.6183036396412334
Evaluation loss 3.179426688570673
Epoch:  11 iteration 0 loss: 2.7541329860687256
Epoch:  11 iteration 100 loss: 2.3711133003234863
Epoch:  11 iteration 200 loss: 3.2783377170562744
Epoch 11 Training loss 2.5806847991577992
Epoch:  12 iteration 0 loss: 2.672988176345825
Epoch:  12 iteration 100 loss: 2.376006841659546
Epoch:  12 iteration 200 loss: 3.1972506046295166
Epoch 12 Training loss 2.5446970471612693
Epoch:  13 iteration 0 loss: 2.6494789123535156
Epoch:  13 iteration 100 loss: 2.3170242309570312
Epoch:  13 iteration 200 loss: 3.1941475868225098
Epoch 13 Training loss 2.5119990739174747
Epoch:  14 iteration 0 loss: 2.5805208683013916
Epoch:  14 iteration 100 loss: 2.287121057510376
Epoch:  14 iteration 200 loss: 3.15193247795105
Epoch 14 Training loss 2.479404618952507
Epoch:  15 iteration 0 loss: 2.5561468601226807
Epoch:  15 iteration 100 loss: 2.263378858566284
Epoch:  15 iteration 200 loss: 3.183692216873169
Epoch 15 Training loss 2.4484731512219886
Evaluation loss 3.1426560713748
Epoch:  16 iteration 0 loss: 2.553135871887207
Epoch:  16 iteration 100 loss: 2.2017245292663574
Epoch:  16 iteration 200 loss: 3.1033968925476074
Epoch 16 Training loss 2.422065194773223
Epoch:  17 iteration 0 loss: 2.5503063201904297
Epoch:  17 iteration 100 loss: 2.1875879764556885
Epoch:  17 iteration 200 loss: 3.0571794509887695
Epoch 17 Training loss 2.392596175684612
Epoch:  18 iteration 0 loss: 2.447784900665283
Epoch:  18 iteration 100 loss: 2.146362781524658
Epoch:  18 iteration 200 loss: 3.064692974090576
Epoch 18 Training loss 2.3654149344515334
Epoch:  19 iteration 0 loss: 2.4578680992126465
Epoch:  19 iteration 100 loss: 2.1460280418395996
Epoch:  19 iteration 200 loss: 3.024839162826538
Epoch 19 Training loss 2.3424499056425168
Epoch:  20 iteration 0 loss: 2.4384076595306396
Epoch:  20 iteration 100 loss: 2.0974316596984863
Epoch:  20 iteration 200 loss: 2.9965004920959473
Epoch 20 Training loss 2.3167023499073878
Evaluation loss 3.1197055689269915
Epoch:  21 iteration 0 loss: 2.3817431926727295
Epoch:  21 iteration 100 loss: 2.0880067348480225
Epoch:  21 iteration 200 loss: 2.9751596450805664
Epoch 21 Training loss 2.290719437303847
Epoch:  22 iteration 0 loss: 2.3944735527038574
Epoch:  22 iteration 100 loss: 2.0802524089813232
Epoch:  22 iteration 200 loss: 2.9455509185791016
Epoch 22 Training loss 2.2698037450677613
Epoch:  23 iteration 0 loss: 2.3046939373016357
Epoch:  23 iteration 100 loss: 2.068814992904663
Epoch:  23 iteration 200 loss: 2.9671618938446045
Epoch 23 Training loss 2.2478544365587227
Epoch:  24 iteration 0 loss: 2.2910232543945312
Epoch:  24 iteration 100 loss: 2.0361578464508057
Epoch:  24 iteration 200 loss: 2.912736177444458
Epoch 24 Training loss 2.2235630649205875
Epoch:  25 iteration 0 loss: 2.335442304611206
Epoch:  25 iteration 100 loss: 2.0128493309020996
Epoch:  25 iteration 200 loss: 2.902696132659912
Epoch 25 Training loss 2.2045435398182813
Evaluation loss 3.1087384036663863
Epoch:  26 iteration 0 loss: 2.257906913757324
Epoch:  26 iteration 100 loss: 1.9572561979293823
Epoch:  26 iteration 200 loss: 2.8583080768585205
Epoch 26 Training loss 2.1859489336062077
Epoch:  27 iteration 0 loss: 2.240891933441162
Epoch:  27 iteration 100 loss: 1.9300264120101929
Epoch:  27 iteration 200 loss: 2.8508572578430176
Epoch 27 Training loss 2.1693027983038515
Epoch:  28 iteration 0 loss: 2.199796199798584
Epoch:  28 iteration 100 loss: 1.9422686100006104
Epoch:  28 iteration 200 loss: 2.842454195022583
Epoch 28 Training loss 2.1484814160984214
Epoch:  29 iteration 0 loss: 2.1854031085968018
Epoch:  29 iteration 100 loss: 1.9529454708099365
Epoch:  29 iteration 200 loss: 2.848923444747925
Epoch 29 Training loss 2.129414516738762
Epoch:  30 iteration 0 loss: 2.1895618438720703
Epoch:  30 iteration 100 loss: 1.871588110923767
Epoch:  30 iteration 200 loss: 2.791942834854126
Epoch 30 Training loss 2.113142051178803
Evaluation loss 3.1089972194763527
Epoch:  31 iteration 0 loss: 2.183242082595825
Epoch:  31 iteration 100 loss: 1.8810741901397705
Epoch:  31 iteration 200 loss: 2.779383897781372
Epoch 31 Training loss 2.095987657767845
Epoch:  32 iteration 0 loss: 2.0996744632720947
Epoch:  32 iteration 100 loss: 1.8364850282669067
Epoch:  32 iteration 200 loss: 2.7766530513763428
Epoch 32 Training loss 2.077641033989847
Epoch:  33 iteration 0 loss: 2.1275956630706787
Epoch:  33 iteration 100 loss: 1.8858064413070679
Epoch:  33 iteration 200 loss: 2.7581260204315186
Epoch 33 Training loss 2.060825001092984
Epoch:  34 iteration 0 loss: 2.0973703861236572
Epoch:  34 iteration 100 loss: 1.851388692855835
Epoch:  34 iteration 200 loss: 2.7524964809417725
Epoch 34 Training loss 2.0462104783610435
Epoch:  35 iteration 0 loss: 2.086354970932007
Epoch:  35 iteration 100 loss: 1.8358268737792969
Epoch:  35 iteration 200 loss: 2.731438398361206
Epoch 35 Training loss 2.0299077402768404
Evaluation loss 3.1139209169721624
Epoch:  36 iteration 0 loss: 2.0591766834259033
Epoch:  36 iteration 100 loss: 1.831368088722229
Epoch:  36 iteration 200 loss: 2.6570539474487305
Epoch 36 Training loss 2.014671925172371
Epoch:  37 iteration 0 loss: 2.035496234893799
Epoch:  37 iteration 100 loss: 1.8156630992889404
Epoch:  37 iteration 200 loss: 2.700183391571045
Epoch 37 Training loss 2.00206255805924
Epoch:  38 iteration 0 loss: 2.036298990249634
Epoch:  38 iteration 100 loss: 1.7919279336929321
Epoch:  38 iteration 200 loss: 2.638498306274414
Epoch 38 Training loss 1.983478224500046
Epoch:  39 iteration 0 loss: 2.0249581336975098
Epoch:  39 iteration 100 loss: 1.7389947175979614
Epoch:  39 iteration 200 loss: 2.7169861793518066
Epoch 39 Training loss 1.9724427386659686
Epoch:  40 iteration 0 loss: 2.0175204277038574
Epoch:  40 iteration 100 loss: 1.7219321727752686
Epoch:  40 iteration 200 loss: 2.6475744247436523
Epoch 40 Training loss 1.9562676721658385
Evaluation loss 3.1181668797161364
Epoch:  41 iteration 0 loss: 2.006847620010376
Epoch:  41 iteration 100 loss: 1.7191071510314941
Epoch:  41 iteration 200 loss: 2.6677799224853516
Epoch 41 Training loss 1.9437097878349063
Epoch:  42 iteration 0 loss: 1.9333022832870483
Epoch:  42 iteration 100 loss: 1.7141562700271606
Epoch:  42 iteration 200 loss: 2.5984952449798584
Epoch 42 Training loss 1.9283085355908671
Epoch:  43 iteration 0 loss: 1.9463298320770264
Epoch:  43 iteration 100 loss: 1.717552900314331
Epoch:  43 iteration 200 loss: 2.612987518310547
Epoch 43 Training loss 1.9148052832706421
Epoch:  44 iteration 0 loss: 1.9681422710418701
Epoch:  44 iteration 100 loss: 1.7166101932525635
Epoch:  44 iteration 200 loss: 2.593944549560547
Epoch 44 Training loss 1.9044130284488674
Epoch:  45 iteration 0 loss: 1.9368000030517578
Epoch:  45 iteration 100 loss: 1.658645749092102
Epoch:  45 iteration 200 loss: 2.593125581741333
Epoch 45 Training loss 1.8893168467190844
Evaluation loss 3.1277268276045214
Epoch:  46 iteration 0 loss: 1.8545007705688477
Epoch:  46 iteration 100 loss: 1.6403976678848267
Epoch:  46 iteration 200 loss: 2.5595622062683105
Epoch 46 Training loss 1.8757247360021512
Epoch:  47 iteration 0 loss: 1.883792519569397
Epoch:  47 iteration 100 loss: 1.6655203104019165
Epoch:  47 iteration 200 loss: 2.551154851913452
Epoch 47 Training loss 1.868178638252467
Epoch:  48 iteration 0 loss: 1.8451733589172363
Epoch:  48 iteration 100 loss: 1.6777702569961548
Epoch:  48 iteration 200 loss: 2.501884937286377
Epoch 48 Training loss 1.8518471154006044
Epoch:  49 iteration 0 loss: 1.8499925136566162
Epoch:  49 iteration 100 loss: 1.6486607789993286
Epoch:  49 iteration 200 loss: 2.524087429046631
Epoch 49 Training loss 1.8454946782718415
Epoch:  50 iteration 0 loss: 1.856377363204956
Epoch:  50 iteration 100 loss: 1.6574885845184326
Epoch:  50 iteration 200 loss: 2.501849412918091
Epoch 50 Training loss 1.8342453327073283
Evaluation loss 3.1381525688403076
Epoch:  51 iteration 0 loss: 1.8513492345809937
Epoch:  51 iteration 100 loss: 1.6156225204467773
Epoch:  51 iteration 200 loss: 2.546480178833008
Epoch 51 Training loss 1.8206363293651437
Epoch:  52 iteration 0 loss: 1.826798915863037
Epoch:  52 iteration 100 loss: 1.5861092805862427
Epoch:  52 iteration 200 loss: 2.486717462539673
Epoch 52 Training loss 1.8091440575272268
Epoch:  53 iteration 0 loss: 1.7943329811096191
Epoch:  53 iteration 100 loss: 1.599743366241455
Epoch:  53 iteration 200 loss: 2.4579596519470215
Epoch 53 Training loss 1.7989700911108664
Epoch:  54 iteration 0 loss: 1.7656499147415161
Epoch:  54 iteration 100 loss: 1.5951091051101685
Epoch:  54 iteration 200 loss: 2.4595048427581787
Epoch 54 Training loss 1.7877836588768
Epoch:  55 iteration 0 loss: 1.7756575345993042
Epoch:  55 iteration 100 loss: 1.5770317316055298
Epoch:  55 iteration 200 loss: 2.4162347316741943
Epoch 55 Training loss 1.7794164511320347
Evaluation loss 3.1487013315196815
Epoch:  56 iteration 0 loss: 1.754793643951416
Epoch:  56 iteration 100 loss: 1.546436071395874
Epoch:  56 iteration 200 loss: 2.4273550510406494
Epoch 56 Training loss 1.7669288957699174
Epoch:  57 iteration 0 loss: 1.7600376605987549
Epoch:  57 iteration 100 loss: 1.4999576807022095
Epoch:  57 iteration 200 loss: 2.439790725708008
Epoch 57 Training loss 1.7579986667589775
Epoch:  58 iteration 0 loss: 1.7710247039794922
Epoch:  58 iteration 100 loss: 1.5441653728485107
Epoch:  58 iteration 200 loss: 2.411104202270508
Epoch 58 Training loss 1.749948290134124
Epoch:  59 iteration 0 loss: 1.7791287899017334
Epoch:  59 iteration 100 loss: 1.5441499948501587
Epoch:  59 iteration 200 loss: 2.4272119998931885
Epoch 59 Training loss 1.7376091327428274
Epoch:  60 iteration 0 loss: 1.7641197443008423
Epoch:  60 iteration 100 loss: 1.505827784538269
Epoch:  60 iteration 200 loss: 2.4162049293518066
Epoch 60 Training loss 1.729162069608205
Evaluation loss 3.1680270844662357
Epoch:  61 iteration 0 loss: 1.719151258468628
Epoch:  61 iteration 100 loss: 1.500209927558899
Epoch:  61 iteration 200 loss: 2.4351766109466553
Epoch 61 Training loss 1.7190746620618302
Epoch:  62 iteration 0 loss: 1.7070326805114746
Epoch:  62 iteration 100 loss: 1.50221848487854
Epoch:  62 iteration 200 loss: 2.399951457977295
Epoch 62 Training loss 1.707298602424269
Epoch:  63 iteration 0 loss: 1.6960980892181396
Epoch:  63 iteration 100 loss: 1.4736263751983643
Epoch:  63 iteration 200 loss: 2.3375589847564697
Epoch 63 Training loss 1.7027722406700785
Epoch:  64 iteration 0 loss: 1.6605229377746582
Epoch:  64 iteration 100 loss: 1.496120572090149
Epoch:  64 iteration 200 loss: 2.377760887145996
Epoch 64 Training loss 1.6901847218926664
Epoch:  65 iteration 0 loss: 1.7002284526824951
Epoch:  65 iteration 100 loss: 1.463133454322815
Epoch:  65 iteration 200 loss: 2.377936601638794
Epoch 65 Training loss 1.6831096865487802
Evaluation loss 3.177895229637778
Epoch:  66 iteration 0 loss: 1.6268677711486816
Epoch:  66 iteration 100 loss: 1.5310866832733154
Epoch:  66 iteration 200 loss: 2.3395535945892334
Epoch 66 Training loss 1.6750581275368728
Epoch:  67 iteration 0 loss: 1.683242678642273
Epoch:  67 iteration 100 loss: 1.4536606073379517
Epoch:  67 iteration 200 loss: 2.33609938621521
Epoch 67 Training loss 1.6638375889732597
Epoch:  68 iteration 0 loss: 1.6539921760559082
Epoch:  68 iteration 100 loss: 1.4477794170379639
Epoch:  68 iteration 200 loss: 2.3414015769958496
Epoch 68 Training loss 1.6606883198725237
Epoch:  69 iteration 0 loss: 1.6292625665664673
Epoch:  69 iteration 100 loss: 1.404828667640686
Epoch:  69 iteration 200 loss: 2.321927547454834
Epoch 69 Training loss 1.6506938973182488
Epoch:  70 iteration 0 loss: 1.6185498237609863
Epoch:  70 iteration 100 loss: 1.4216632843017578
Epoch:  70 iteration 200 loss: 2.3253204822540283
Epoch 70 Training loss 1.6387621088477575
Evaluation loss 3.1902488400655886
Epoch:  71 iteration 0 loss: 1.6030402183532715
Epoch:  71 iteration 100 loss: 1.4137858152389526
Epoch:  71 iteration 200 loss: 2.3256776332855225
Epoch 71 Training loss 1.6318460844078808
Epoch:  72 iteration 0 loss: 1.6068423986434937
Epoch:  72 iteration 100 loss: 1.4504164457321167
Epoch:  72 iteration 200 loss: 2.3437039852142334
Epoch 72 Training loss 1.6246998589395558
Epoch:  73 iteration 0 loss: 1.5764877796173096
Epoch:  73 iteration 100 loss: 1.3730628490447998
Epoch:  73 iteration 200 loss: 2.264051675796509
Epoch 73 Training loss 1.6186856142415567
Epoch:  74 iteration 0 loss: 1.5833429098129272
Epoch:  74 iteration 100 loss: 1.381920576095581
Epoch:  74 iteration 200 loss: 2.2876336574554443
Epoch 74 Training loss 1.6106610198597258
Epoch:  75 iteration 0 loss: 1.5880494117736816
Epoch:  75 iteration 100 loss: 1.4044418334960938
Epoch:  75 iteration 200 loss: 2.2574541568756104
Epoch 75 Training loss 1.5998829403443475
Evaluation loss 3.205575323503987
Epoch:  76 iteration 0 loss: 1.5913504362106323
Epoch:  76 iteration 100 loss: 1.3733941316604614
Epoch:  76 iteration 200 loss: 2.273179292678833
Epoch 76 Training loss 1.5944278182877876
Epoch:  77 iteration 0 loss: 1.574967861175537
Epoch:  77 iteration 100 loss: 1.4105134010314941
Epoch:  77 iteration 200 loss: 2.260707139968872
Epoch 77 Training loss 1.5890476528108952
Epoch:  78 iteration 0 loss: 1.5877436399459839
Epoch:  78 iteration 100 loss: 1.3723187446594238
Epoch:  78 iteration 200 loss: 2.266782760620117
Epoch 78 Training loss 1.580453802036902
Epoch:  79 iteration 0 loss: 1.540144920349121
Epoch:  79 iteration 100 loss: 1.370208978652954
Epoch:  79 iteration 200 loss: 2.2479166984558105
Epoch 79 Training loss 1.5723614631359557
Epoch:  80 iteration 0 loss: 1.5240201950073242
Epoch:  80 iteration 100 loss: 1.3667224645614624
Epoch:  80 iteration 200 loss: 2.2798657417297363
Epoch 80 Training loss 1.5671947631266923
Evaluation loss 3.2182803124543784
Epoch:  81 iteration 0 loss: 1.5349093675613403
Epoch:  81 iteration 100 loss: 1.341757893562317
Epoch:  81 iteration 200 loss: 2.2628333568573
Epoch 81 Training loss 1.5582374857442876
Epoch:  82 iteration 0 loss: 1.4877135753631592
Epoch:  82 iteration 100 loss: 1.3469762802124023
Epoch:  82 iteration 200 loss: 2.2514214515686035
Epoch 82 Training loss 1.5549645483978292
Epoch:  83 iteration 0 loss: 1.5119167566299438
Epoch:  83 iteration 100 loss: 1.3386821746826172
Epoch:  83 iteration 200 loss: 2.2184598445892334
Epoch 83 Training loss 1.546844436348798
Epoch:  84 iteration 0 loss: 1.4820687770843506
Epoch:  84 iteration 100 loss: 1.3448508977890015
Epoch:  84 iteration 200 loss: 2.199396848678589
Epoch 84 Training loss 1.5380232074195026
Epoch:  85 iteration 0 loss: 1.4752027988433838
Epoch:  85 iteration 100 loss: 1.316656231880188
Epoch:  85 iteration 200 loss: 2.228752374649048
Epoch 85 Training loss 1.52975351648403
Evaluation loss 3.2336413650535087
Epoch:  86 iteration 0 loss: 1.499496340751648
Epoch:  86 iteration 100 loss: 1.3332045078277588
Epoch:  86 iteration 200 loss: 2.2489013671875
Epoch 86 Training loss 1.5249615564712846
Epoch:  87 iteration 0 loss: 1.50925874710083
Epoch:  87 iteration 100 loss: 1.3083447217941284
Epoch:  87 iteration 200 loss: 2.235308885574341
Epoch 87 Training loss 1.5197892824018502
Epoch:  88 iteration 0 loss: 1.4814422130584717
Epoch:  88 iteration 100 loss: 1.3245668411254883
Epoch:  88 iteration 200 loss: 2.193997859954834
Epoch 88 Training loss 1.5135974575387956
Epoch:  89 iteration 0 loss: 1.4810220003128052
Epoch:  89 iteration 100 loss: 1.2921677827835083
Epoch:  89 iteration 200 loss: 2.1645917892456055
Epoch 89 Training loss 1.5075664417517958
Epoch:  90 iteration 0 loss: 1.4697095155715942
Epoch:  90 iteration 100 loss: 1.2751893997192383
Epoch:  90 iteration 200 loss: 2.188906669616699
Epoch 90 Training loss 1.5008888401218585
Evaluation loss 3.2456318169295293
Epoch:  91 iteration 0 loss: 1.4636540412902832
Epoch:  91 iteration 100 loss: 1.3394463062286377
Epoch:  91 iteration 200 loss: 2.192689895629883
Epoch 91 Training loss 1.4943399774943313
Epoch:  92 iteration 0 loss: 1.4552161693572998
Epoch:  92 iteration 100 loss: 1.2322344779968262
Epoch:  92 iteration 200 loss: 2.1635537147521973
Epoch 92 Training loss 1.488440135669707
Epoch:  93 iteration 0 loss: 1.4642064571380615
Epoch:  93 iteration 100 loss: 1.2490650415420532
Epoch:  93 iteration 200 loss: 2.137782573699951
Epoch 93 Training loss 1.4828345331954083
Epoch:  94 iteration 0 loss: 1.425548791885376
Epoch:  94 iteration 100 loss: 1.2757179737091064
Epoch:  94 iteration 200 loss: 2.1594502925872803
Epoch 94 Training loss 1.47362902414513
Epoch:  95 iteration 0 loss: 1.4208916425704956
Epoch:  95 iteration 100 loss: 1.260089635848999
Epoch:  95 iteration 200 loss: 2.1245341300964355
Epoch 95 Training loss 1.468862286276855
Evaluation loss 3.265405671529478
Epoch:  96 iteration 0 loss: 1.413726568222046
Epoch:  96 iteration 100 loss: 1.2730776071548462
Epoch:  96 iteration 200 loss: 2.1034820079803467
Epoch 96 Training loss 1.464572765902645
Epoch:  97 iteration 0 loss: 1.3888133764266968
Epoch:  97 iteration 100 loss: 1.29197096824646
Epoch:  97 iteration 200 loss: 2.159865617752075
Epoch 97 Training loss 1.4591572745032382
Epoch:  98 iteration 0 loss: 1.3947553634643555
Epoch:  98 iteration 100 loss: 1.271963119506836
Epoch:  98 iteration 200 loss: 2.1502716541290283
Epoch 98 Training loss 1.4532260618277022
Epoch:  99 iteration 0 loss: 1.4218417406082153
Epoch:  99 iteration 100 loss: 1.2315309047698975
Epoch:  99 iteration 200 loss: 2.12766695022583
Epoch 99 Training loss 1.4487215552807855

6. Translation

def translate_dev(i):
    en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])  # the original English sentence
    print(en_sent)
    cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])  # the reference Chinese sentence
    print("".join(cn_sent))

    # a single sentence
    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)  # shape: [1, 1], i.e. [[2]]
    
    # y is the single BOS token [[2]]; translation has shape [1, 10]
    translation, attn = model.translate(mb_x, mb_x_len, bos)
    # map the indices back to Chinese characters
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print("".join(trans))           # the model's translation

# load the trained model
model.load_state_dict(torch.load('translate_model.pt', map_location=device))
for i in range(100, 120):
    translate_dev(i)
    print()

Results (the corpus is small and training was short, so the quality is poor):

BOS you have nice skin . EOS
BOS 你 的 皮 膚 真 好 。 EOS
你只有一些蛋糕。

BOS you 're UNK correct . EOS
BOS 你 部 分 正 确 。 EOS
你可以选择。

BOS everyone admired his courage . EOS
BOS 每 個 人 都 佩 服 他 的 勇 氣 。 EOS
每個人都抨擊他的健康

BOS what time is it ? EOS
BOS 几 点 了 ? EOS
那是什么?

BOS i 'm free tonight . EOS
BOS 我 今 晚 有 空 。 EOS
我今晚有空。

BOS here is your book . EOS
BOS 這 是 你 的 書 。 EOS
那是你的书。

BOS they are at lunch . EOS
BOS 他 们 在 吃 午 饭 。 EOS
他們正在吃午飯。

BOS this chair is UNK . EOS
BOS 這 把 椅 子 很 UNK 。 EOS
这本书非常兴奋。

BOS it 's pretty heavy . EOS
BOS 它 真 重 。 EOS
它是最好的。

BOS many attended his funeral . EOS
BOS 很 多 人 都 参 加 了 他 的 葬 礼 。 EOS
在这个男人正在看他。

BOS training will be provided . EOS
BOS 会 有 训 练 。 EOS
努力停為下雪停。

BOS someone is watching you . EOS
BOS 有 人 在 看 著 你 。 EOS
有很多就了。

BOS i slapped his face . EOS
BOS 我 摑 了 他 的 臉 。 EOS
我认为我的狗。

BOS i like UNK music . EOS
BOS 我 喜 歡 流 行 音 樂 。 EOS
我喜欢音乐。

BOS tom had no children . EOS
BOS T o m 沒 有 孩 子 。 EOS
她的父親沒有聽盲。

BOS please lock the door . EOS
BOS 請 把 門 鎖 上 。 EOS
請關上門。

BOS tom has calmed down . EOS
BOS 汤 姆 冷 静 下 来 了 。 EOS
汤姆坐在机器。

BOS please speak more loudly . EOS
BOS 請 說 大 聲 一 點 兒 。 EOS
請說話再說話。

BOS keep next sunday free . EOS
BOS 把 下 周 日 空 出 来 。 EOS
星星期天下雨。

BOS i made a mistake . EOS
BOS 我 犯 了 一 個 錯 。 EOS
我一直成為一個演員。

7. Encoder-Decoder model (with attention)

7.1 Encoder

The encoder passes the input words through an embedding layer and a GRU, turning them into hidden states that serve as the context vectors later on;

class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        # hid: [2, batch_size, enc_hidden_size]
        
        hid = torch.cat([hid[-2], hid[-1]], dim=1) # concatenate the forward and backward states of the last layer
        # hid: [batch_size, 2*enc_hidden_size]
        hid = torch.tanh(self.fc(hid)).unsqueeze(0)
        # hid: [1, batch_size, dec_hidden_size]
        # out: [batch_size, seq_len, 2*enc_hidden_size]
        return out, hid
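
A quick shape check on toy numbers (a sketch, not in the original notes): out keeps both directions, while hid is projected down to the decoder hidden size:

enc = Encoder(vocab_size=20, embed_size=8, enc_hidden_size=8, dec_hidden_size=6)
toy_x = torch.randint(2, 20, (3, 5))                 # 3 sentences of (padded) length 5
out, hid = enc(toy_x, torch.tensor([5, 4, 3]))
print(out.shape, hid.shape)                          # torch.Size([3, 5, 16]) torch.Size([1, 3, 6])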

7.2 Luong Attention

  • \(h_t\) is the GRU output at decoding step t, i.e. the decoder output

  • \(\hat{h}_s\) are the encoder hidden states (the context); in query-key-value terms, the query is \(h_t\) and the keys are \(\hat{h}_s\)

  • the output is computed from the context vectors and the current decoder hidden states (the formulas are written out after this list);
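
For reference, the computation implemented by the Attention module below (Luong's "general" score) can be summarized as:

\[
\mathrm{score}(h_t, \hat{h}_s) = h_t^\top W_a \hat{h}_s, \qquad
a_t(s) = \frac{\exp\big(\mathrm{score}(h_t, \hat{h}_s)\big)}{\sum_{s'} \exp\big(\mathrm{score}(h_t, \hat{h}_{s'})\big)}
\]

\[
c_t = \sum_s a_t(s)\,\hat{h}_s, \qquad
\tilde{h}_t = \tanh\big(W_c\,[c_t; h_t]\big)
\]

Here \(W_a\) corresponds to self.linear_in and \(W_c\) to self.linear_out in the code.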

class Attention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        # enc_hidden_size is the same as in the Encoder
        super(Attention, self).__init__()
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size

        self.linear_in = nn.Linear(enc_hidden_size*2, dec_hidden_size, bias=False)
        self.linear_out = nn.Linear(enc_hidden_size*2 + dec_hidden_size, dec_hidden_size)
        
    def forward(self, output, context, mask):
        # mask: batch_size, output_len, context_len     # the mask is built in the Decoder
        # output: batch_size, output_len, dec_hidden_size, i.e. the Decoder output
        # context: batch_size, context_len, 2*enc_hidden_size, i.e. the Encoder output 
        # the Encoder is bidirectional here, the Decoder is unidirectional
    
        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = context.size(1) # input_len = context_len
        
        # a score is computed from the decoder hidden states and the encoder hidden states,
        # then turned into attention weights
        # batch_size, context_len, dec_hidden_size
        # step 1: multiply the encoder output by Wa, mapping 2*enc_hidden_size to dec_hidden_size
        # i.e. Q: W * context
        context_in = self.linear_in(context.view(batch_size*input_len, -1)).view(                
                                    batch_size, input_len, -1) 
        
        # Q·K
        # context_in.transpose(1,2): batch_size, dec_hidden_size, context_len 
        # output: batch_size, output_len, dec_hidden_size
        attn = torch.bmm(output, context_in.transpose(1,2)) 
        # batch_size, output_len, context_len
        # step 2: multiply h_t with the result of step 1 to get the score

        attn.data.masked_fill_(mask, -1e6)
        # on .masked_fill_ see: https://blog.csdn.net/candy134834/article/details/84594754
        # (in-place; the out-of-place masked_fill would return a new tensor and leave attn unchanged)
        # mask must have the same shape as attn; positions where mask is 1 are set to -1e6,
        # and the meaning of mask == 1 is defined in the Decoder below

        attn = F.softmax(attn, dim=2) 
        # batch_size, output_len, context_len
        # see the small example after this class for what softmax over dim=2 does
        # step 3: the weight of every encoder hidden state for each decoder position
        
        # context: batch_size, context_len, 2*enc_hidden_size
        context = torch.bmm(attn, context) 
        # batch_size, output_len, 2*enc_hidden_size
        # step 4: the context vector is a weighted average of the encoder hidden states
        
        # output: batch_size, output_len, dec_hidden_size
        output = torch.cat((context, output), dim=2) 
        # output: batch_size, output_len, 2*enc_hidden_size+dec_hidden_size
        # step 5: concatenate the context vector with the decoder hidden states
        
        output = output.view(batch_size*output_len, -1)
        # output.shape = (batch_size*output_len, 2*enc_hidden_size+dec_hidden_size)
        output = torch.tanh(self.linear_out(output)) 
        # output.shape = (batch_size*output_len, dec_hidden_size)
        output = output.view(batch_size, output_len, -1)
        # output.shape = (batch_size, output_len, dec_hidden_size)
        # attn.shape = (batch_size, output_len, context_len)
        return output, attn
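
The example referred to in the comments above: softmax over dim=2 gives, for each decoder position, a distribution over the encoder positions (toy tensor, not from the original notes):

a = torch.randn(1, 2, 3)          # batch=1, output_len=2, context_len=3
w = F.softmax(a, dim=2)
print(w.sum(dim=2))               # tensor([[1.0000, 1.0000]]) -- each row sums to 1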

7.3 Decoder

The decoder decides the next output word based on what has been translated so far and the context vectors;

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, dec_hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self, x_len, y_len):
        # x_len: the lengths of the Chinese (target) sentences in the batch
        # y_len: the lengths of the English (source) sentences in the batch
        # returns a mask of shape (batch_size, max_x_len, max_y_len)
        device = x_len.device
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        
        x_mask = torch.arange(max_x_len, device=device)[None, :] < x_len[:, None]
        # x_mask.shape = (batch_size, output_len)   # mask of the Chinese sentences
        y_mask = torch.arange(max_y_len, device=device)[None, :] < y_len[:, None]
        # y_mask.shape = (batch_size, context_len)  # mask of the English sentences
        
        mask = ( ~ x_mask[:, :, None] * y_mask[:, None, :]).byte()
        # alternative: mask = (1 - x_mask[:, :, None] * y_mask[:, None, :]).byte()
        # the ~ / 1- flips the mask, so 1 marks the positions to be filled with -1e6
        # x_mask[:, :, None] = (batch_size, output_len, 1)
        # y_mask[:, None, :] = (batch_size, 1, context_len)
        # mask.shape = (batch_size, output_len, context_len)
        # note that * here is elementwise with broadcasting, not torch.bmm
        return mask
    
    def forward(self, encoder_out, x_lengths, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]
        
        y_sorted = self.dropout(self.embed(y_sorted)) # batch_size, output_length, embed_size

        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()

        mask = self.create_mask(y_lengths, x_lengths) # careful: the first argument is the Chinese (target) lengths

        output, attn = self.attention(output_seq, encoder_out, mask) 
        # output.shape = (batch_size, output_len, dec_hidden_size)
        # attn.shape = (batch_size, output_len, context_len)
        
        # self.out = nn.Linear(dec_hidden_size, vocab_size)
        output = F.log_softmax(self.out(output), -1) # output probabilities
        # output.shape = (batch_size, output_len, vocab_size)
        # log_softmax over the last (vocab_size) dimension
        # hid.shape = (1, batch_size, dec_hidden_size)
        return output, hid, attn
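
A toy check of create_mask (a sketch; the sizes are made up): the mask has one row per target position and one column per source position:

dec = Decoder(vocab_size=10, embed_size=8, enc_hidden_size=8, dec_hidden_size=8)
m = dec.create_mask(torch.tensor([3, 2]), torch.tensor([4, 2]))
print(m.shape)    # torch.Size([2, 3, 4]) -> (batch_size, output_len, context_len)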

7.4 Seq2Seq

Finally, we build the Seq2Seq module to tie the encoder, attention, and decoder together.

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        # print(hid.shape)=torch.Size([1, batch_size, dec_hidden_size])
        # print(out.shape)=torch.Size([batch_size, seq_len, 2*enc_hidden_size])
        output, hid, attn = self.decoder(encoder_out=encoder_out, 
                    x_lengths=x_lengths,
                    y=y,
                    y_lengths=y_lengths,
                    hid=hid)
        # output =(batch_size, output_len, vocab_size)
        # hid.shape = (1, batch_size, dec_hidden_size)
        # attn.shape = (batch_size, output_len, context_len)
        return output, attn
    

    def translate(self, x, x_lengths, y, max_length=100):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid, attn = self.decoder(encoder_out, 
                    x_lengths,
                    y,
                    torch.ones(batch_size).long().to(y.device),
                    hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            attns.append(attn)
            
        return torch.cat(preds, 1), torch.cat(attns, 1)

8. Create the attention model and call the train function above

dropout = 0.2
embed_size = hidden_size = 100
encoder = Encoder(vocab_size=en_total_words,
                    embed_size=embed_size,
                    enc_hidden_size=hidden_size,
                    dec_hidden_size=hidden_size,
                    dropout=dropout)
decoder = Decoder(vocab_size=cn_total_words,
                    embed_size=embed_size,
                    enc_hidden_size=hidden_size,
                    dec_hidden_size=hidden_size,
                    dropout=dropout)
model = Seq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

train(model, train_data, num_epochs=100)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:25: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at  /pytorch/aten/src/ATen/native/cuda/LegacyDefinitions.cpp:19.)
Epoch 0 iteration 0 loss 8.077441215515137
Epoch 0 iteration 100 loss 5.345982551574707
Epoch 0 iteration 200 loss 4.56335973739624
Epoch 0 Training loss 5.50921318013691
Evaluation loss 5.080491080824646
Epoch 1 iteration 0 loss 4.47300386428833
Epoch 1 iteration 100 loss 4.909076690673828
Epoch 1 iteration 200 loss 4.016790390014648
Epoch 1 Training loss 4.876065829219002
Epoch 2 iteration 0 loss 3.9774909019470215
Epoch 2 iteration 100 loss 4.472506046295166
Epoch 2 iteration 200 loss 3.612961530685425
Epoch 2 Training loss 4.438564572733501
Epoch 3 iteration 0 loss 3.582581043243408
Epoch 3 iteration 100 loss 4.136115074157715
Epoch 3 iteration 200 loss 3.3212907314300537
Epoch 3 Training loss 4.112743628998822
Epoch 4 iteration 0 loss 3.2368381023406982
Epoch 4 iteration 100 loss 3.8409037590026855
Epoch 4 iteration 200 loss 3.097996711730957
Epoch 4 Training loss 3.8477170270406864
Epoch 5 iteration 0 loss 3.0059776306152344
Epoch 5 iteration 100 loss 3.6137866973876953
Epoch 5 iteration 200 loss 2.8685319423675537
Epoch 5 Training loss 3.62822900357212
Evaluation loss 3.637867412334476
Epoch 6 iteration 0 loss 2.742856979370117
Epoch 6 iteration 100 loss 3.390110492706299
Epoch 6 iteration 200 loss 2.6777687072753906
Epoch 6 Training loss 3.438536527389024
Epoch 7 iteration 0 loss 2.585566759109497
Epoch 7 iteration 100 loss 3.237795352935791
Epoch 7 iteration 200 loss 2.5204241275787354
Epoch 7 Training loss 3.27654657979662
Epoch 8 iteration 0 loss 2.4295897483825684
Epoch 8 iteration 100 loss 3.1119232177734375
Epoch 8 iteration 200 loss 2.3597609996795654
Epoch 8 Training loss 3.134849904339776
Epoch 9 iteration 0 loss 2.2652432918548584
Epoch 9 iteration 100 loss 2.9519033432006836
Epoch 9 iteration 200 loss 2.217094898223877
Epoch 9 Training loss 3.0061632458155874
Epoch 10 iteration 0 loss 2.1327497959136963
Epoch 10 iteration 100 loss 2.851846694946289
Epoch 10 iteration 200 loss 2.1458141803741455
Epoch 10 Training loss 2.894426641793655
Evaluation loss 3.166497583308483
Epoch 11 iteration 0 loss 2.013716697692871
Epoch 11 iteration 100 loss 2.7616653442382812
Epoch 11 iteration 200 loss 2.0029869079589844
Epoch 11 Training loss 2.791488365026667
Epoch 12 iteration 0 loss 1.9475183486938477
Epoch 12 iteration 100 loss 2.647017240524292
Epoch 12 iteration 200 loss 1.909979224205017
Epoch 12 Training loss 2.698569336892456
Epoch 13 iteration 0 loss 1.823117733001709
Epoch 13 iteration 100 loss 2.6043999195098877
Epoch 13 iteration 200 loss 1.8382450342178345
Epoch 13 Training loss 2.616960156850951
Epoch 14 iteration 0 loss 1.7701350450515747
Epoch 14 iteration 100 loss 2.528083086013794
Epoch 14 iteration 200 loss 1.7523369789123535
Epoch 14 Training loss 2.5364692366823496
Epoch 15 iteration 0 loss 1.6475502252578735
Epoch 15 iteration 100 loss 2.4581422805786133
Epoch 15 iteration 200 loss 1.7099241018295288
Epoch 15 Training loss 2.4666260303200516
Evaluation loss 2.96595491125677
Epoch 16 iteration 0 loss 1.5571707487106323
Epoch 16 iteration 100 loss 2.3642022609710693
Epoch 16 iteration 200 loss 1.6701610088348389
Epoch 16 Training loss 2.3992404009048993
Epoch 17 iteration 0 loss 1.5091164112091064
Epoch 17 iteration 100 loss 2.3246700763702393
Epoch 17 iteration 200 loss 1.5856270790100098
Epoch 17 Training loss 2.3398954671301877
Epoch 18 iteration 0 loss 1.4500510692596436
Epoch 18 iteration 100 loss 2.3111109733581543
Epoch 18 iteration 200 loss 1.5008033514022827
Epoch 18 Training loss 2.2817300454663068
Epoch 19 iteration 0 loss 1.3648465871810913
Epoch 19 iteration 100 loss 2.2263357639312744
Epoch 19 iteration 200 loss 1.434478521347046
Epoch 19 Training loss 2.2250880660919448
Epoch 20 iteration 0 loss 1.29836106300354
Epoch 20 iteration 100 loss 2.170522928237915
Epoch 20 iteration 200 loss 1.413167119026184
Epoch 20 Training loss 2.174868286439991
Evaluation loss 2.862008639379293
Epoch 21 iteration 0 loss 1.2679147720336914
Epoch 21 iteration 100 loss 2.1024975776672363
Epoch 21 iteration 200 loss 1.3479344844818115
Epoch 21 Training loss 2.124773566655596
Epoch 22 iteration 0 loss 1.2715562582015991
Epoch 22 iteration 100 loss 2.0454132556915283
Epoch 22 iteration 200 loss 1.2550404071807861
Epoch 22 Training loss 2.0813773049198834
Epoch 23 iteration 0 loss 1.204933762550354
Epoch 23 iteration 100 loss 1.986390471458435
Epoch 23 iteration 200 loss 1.3080803155899048
Epoch 23 Training loss 2.035502688247159
Epoch 24 iteration 0 loss 1.1525975465774536
Epoch 24 iteration 100 loss 2.010538101196289
Epoch 24 iteration 200 loss 1.2282871007919312
Epoch 24 Training loss 1.9932144449453215
Epoch 25 iteration 0 loss 1.1036208868026733
Epoch 25 iteration 100 loss 1.9166961908340454
Epoch 25 iteration 200 loss 1.1343692541122437
Epoch 25 Training loss 1.9600739742604965
Evaluation loss 2.8176820923223045
Epoch 26 iteration 0 loss 1.126081109046936
Epoch 26 iteration 100 loss 1.8861745595932007
Epoch 26 iteration 200 loss 1.1452618837356567
Epoch 26 Training loss 1.9179931864284319
Epoch 27 iteration 0 loss 1.0936931371688843
Epoch 27 iteration 100 loss 1.8307372331619263
Epoch 27 iteration 200 loss 1.1571146249771118
Epoch 27 Training loss 1.8831396913691085
Epoch 28 iteration 0 loss 1.0479011535644531
Epoch 28 iteration 100 loss 1.8134833574295044
Epoch 28 iteration 200 loss 1.1056196689605713
Epoch 28 Training loss 1.8484488868290145
Epoch 29 iteration 0 loss 1.0205118656158447
Epoch 29 iteration 100 loss 1.821661353111267
Epoch 29 iteration 200 loss 1.0737680196762085
Epoch 29 Training loss 1.8186136229030332
Epoch 30 iteration 0 loss 0.9615429043769836
Epoch 30 iteration 100 loss 1.7652055025100708
Epoch 30 iteration 200 loss 0.9891017079353333
Epoch 30 Training loss 1.7838154237577641
Evaluation loss 2.791978492601989
Epoch 31 iteration 0 loss 0.9656916856765747
Epoch 31 iteration 100 loss 1.7245019674301147
Epoch 31 iteration 200 loss 1.0227261781692505
Epoch 31 Training loss 1.7579890261914233
Epoch 32 iteration 0 loss 0.950885534286499
Epoch 32 iteration 100 loss 1.7047593593597412
Epoch 32 iteration 200 loss 1.0126252174377441
Epoch 32 Training loss 1.7265817618896626
Epoch 33 iteration 0 loss 0.9383729696273804
Epoch 33 iteration 100 loss 1.7073816061019897
Epoch 33 iteration 200 loss 0.9319257736206055
Epoch 33 Training loss 1.701657226905382
Epoch 34 iteration 0 loss 0.8925782442092896
Epoch 34 iteration 100 loss 1.6764633655548096
Epoch 34 iteration 200 loss 0.9110333323478699
Epoch 34 Training loss 1.6714374329267176
Epoch 35 iteration 0 loss 0.9124199748039246
Epoch 35 iteration 100 loss 1.5932414531707764
Epoch 35 iteration 200 loss 0.9045222997665405
Epoch 35 Training loss 1.6459569074645013
Evaluation loss 2.7976669954047697
Epoch 36 iteration 0 loss 0.8820086121559143
Epoch 36 iteration 100 loss 1.5867435932159424
Epoch 36 iteration 200 loss 0.88615483045578
Epoch 36 Training loss 1.6248752288905044
Epoch 37 iteration 0 loss 0.8861231803894043
Epoch 37 iteration 100 loss 1.540147304534912
Epoch 37 iteration 200 loss 0.8625170588493347
Epoch 37 Training loss 1.6025891727084938
Epoch 38 iteration 0 loss 0.8272038698196411
Epoch 38 iteration 100 loss 1.5469865798950195
Epoch 38 iteration 200 loss 0.8701044321060181
Epoch 38 Training loss 1.5775597927062583
Epoch 39 iteration 0 loss 0.7841694951057434
Epoch 39 iteration 100 loss 1.587996244430542
Epoch 39 iteration 200 loss 0.8621845245361328
Epoch 39 Training loss 1.5550835649611023
Epoch 40 iteration 0 loss 0.7730535268783569
Epoch 40 iteration 100 loss 1.510125756263733
Epoch 40 iteration 200 loss 0.8023701906204224
Epoch 40 Training loss 1.536449474043806
Evaluation loss 2.794806465695927
Epoch 41 iteration 0 loss 0.8037686347961426
Epoch 41 iteration 100 loss 1.4897831678390503
Epoch 41 iteration 200 loss 0.791727602481842
Epoch 41 Training loss 1.5090646408452422
Epoch 42 iteration 0 loss 0.7824649214744568
Epoch 42 iteration 100 loss 1.4806140661239624
Epoch 42 iteration 200 loss 0.7969489693641663
Epoch 42 Training loss 1.4928973876534222
Epoch 43 iteration 0 loss 0.7667363286018372
Epoch 43 iteration 100 loss 1.4101524353027344
Epoch 43 iteration 200 loss 0.7620548009872437
Epoch 43 Training loss 1.4743025649328945
Epoch 44 iteration 0 loss 0.7359268069267273
Epoch 44 iteration 100 loss 1.3919748067855835
Epoch 44 iteration 200 loss 0.8053562045097351
Epoch 44 Training loss 1.4554574874191657
Epoch 45 iteration 0 loss 0.7237775921821594
Epoch 45 iteration 100 loss 1.3988888263702393
Epoch 45 iteration 200 loss 0.7393531203269958
Epoch 45 Training loss 1.4322836776244472
Evaluation loss 2.812571211478882
Epoch 46 iteration 0 loss 0.6948044300079346
Epoch 46 iteration 100 loss 1.304335594177246
Epoch 46 iteration 200 loss 0.689096987247467
Epoch 46 Training loss 1.4196053662905366
Epoch 47 iteration 0 loss 0.6662931442260742
Epoch 47 iteration 100 loss 1.3609318733215332
Epoch 47 iteration 200 loss 0.7002820372581482
Epoch 47 Training loss 1.4011935120614474
Epoch 48 iteration 0 loss 0.753171443939209
Epoch 48 iteration 100 loss 1.290736436843872
Epoch 48 iteration 200 loss 0.6648774147033691
Epoch 48 Training loss 1.3849073988196539
Epoch 49 iteration 0 loss 0.7202473878860474
Epoch 49 iteration 100 loss 1.3155896663665771
Epoch 49 iteration 200 loss 0.7304859757423401
Epoch 49 Training loss 1.3667800886861978
Epoch 50 iteration 0 loss 0.6739968061447144
Epoch 50 iteration 100 loss 1.3187365531921387
Epoch 50 iteration 200 loss 0.6818186044692993
Epoch 50 Training loss 1.3522975228605367
Evaluation loss 2.8305587463367226
Epoch 51 iteration 0 loss 0.7073860168457031
Epoch 51 iteration 100 loss 1.3020031452178955
Epoch 51 iteration 200 loss 0.6439692974090576
Epoch 51 Training loss 1.3355847990987002
Epoch 52 iteration 0 loss 0.7059903144836426
Epoch 52 iteration 100 loss 1.3240293264389038
Epoch 52 iteration 200 loss 0.6690763831138611
Epoch 52 Training loss 1.3210225660783441
Epoch 53 iteration 0 loss 0.6332668662071228
Epoch 53 iteration 100 loss 1.2513703107833862
Epoch 53 iteration 200 loss 0.6558292508125305
Epoch 53 Training loss 1.3107876620531327
Epoch 54 iteration 0 loss 0.6457605957984924
Epoch 54 iteration 100 loss 1.246716856956482
Epoch 54 iteration 200 loss 0.6521980166435242
Epoch 54 Training loss 1.2941160204924305
Epoch 55 iteration 0 loss 0.6227668523788452
Epoch 55 iteration 100 loss 1.2278225421905518
Epoch 55 iteration 200 loss 0.6727674007415771
Epoch 55 Training loss 1.2778384867442392
Evaluation loss 2.853066331010339
Epoch 56 iteration 0 loss 0.5656446814537048
Epoch 56 iteration 100 loss 1.2470365762710571
Epoch 56 iteration 200 loss 0.6154574751853943
Epoch 56 Training loss 1.2628238236702862
Epoch 57 iteration 0 loss 0.5883901119232178
Epoch 57 iteration 100 loss 1.220670461654663
Epoch 57 iteration 200 loss 0.5693823099136353
Epoch 57 Training loss 1.2493639340990528
Epoch 58 iteration 0 loss 0.5862078666687012
Epoch 58 iteration 100 loss 1.1798666715621948
Epoch 58 iteration 200 loss 0.6039236187934875
Epoch 58 Training loss 1.233422517480705
Epoch 59 iteration 0 loss 0.5904982686042786
Epoch 59 iteration 100 loss 1.1922262907028198
Epoch 59 iteration 200 loss 0.5879594087600708
Epoch 59 Training loss 1.2254928604160356
Epoch 60 iteration 0 loss 0.5759232640266418
Epoch 60 iteration 100 loss 1.153181791305542
Epoch 60 iteration 200 loss 0.5618763566017151
Epoch 60 Training loss 1.208009701754125
Evaluation loss 2.871801325149645
Epoch 61 iteration 0 loss 0.5813993215560913
Epoch 61 iteration 100 loss 1.1644539833068848
Epoch 61 iteration 200 loss 0.574725329875946
Epoch 61 Training loss 1.1981734446603696
Epoch 62 iteration 0 loss 0.54474276304245
Epoch 62 iteration 100 loss 1.172760248184204
Epoch 62 iteration 200 loss 0.5736648440361023
Epoch 62 Training loss 1.1898703442169898
Epoch 63 iteration 0 loss 0.5367869138717651
Epoch 63 iteration 100 loss 1.1455975770950317
Epoch 63 iteration 200 loss 0.5316013097763062
Epoch 63 Training loss 1.17624104425602
Epoch 64 iteration 0 loss 0.5965208411216736
Epoch 64 iteration 100 loss 1.0865147113800049
Epoch 64 iteration 200 loss 0.5165320634841919
Epoch 64 Training loss 1.1626691673104586
Epoch 65 iteration 0 loss 0.5757507085800171
Epoch 65 iteration 100 loss 1.0935884714126587
Epoch 65 iteration 200 loss 0.5055180191993713
Epoch 65 Training loss 1.1486647791128823
Evaluation loss 2.888705662914898
Epoch 66 iteration 0 loss 0.554165244102478
Epoch 66 iteration 100 loss 1.0687988996505737
Epoch 66 iteration 200 loss 0.5742641687393188
Epoch 66 Training loss 1.137105361580985
Epoch 67 iteration 0 loss 0.5457087755203247
Epoch 67 iteration 100 loss 1.0431346893310547
Epoch 67 iteration 200 loss 0.5005226731300354
Epoch 67 Training loss 1.1251085623172112
Epoch 68 iteration 0 loss 0.5115629434585571
Epoch 68 iteration 100 loss 1.0742378234863281
Epoch 68 iteration 200 loss 0.4768718481063843
Epoch 68 Training loss 1.1169700110112382
Epoch 69 iteration 0 loss 0.5225317478179932
Epoch 69 iteration 100 loss 1.041317343711853
Epoch 69 iteration 200 loss 0.534132719039917
Epoch 69 Training loss 1.1102069269037087
Epoch 70 iteration 0 loss 0.48191702365875244
Epoch 70 iteration 100 loss 1.0193127393722534
Epoch 70 iteration 200 loss 0.4716692566871643
Epoch 70 Training loss 1.0953487060532974
Evaluation loss 2.9113613200675643
Epoch 71 iteration 0 loss 0.59366375207901
Epoch 71 iteration 100 loss 1.042155146598816
Epoch 71 iteration 200 loss 0.45154234766960144
Epoch 71 Training loss 1.091857606453407
Epoch 72 iteration 0 loss 0.5238001346588135
Epoch 72 iteration 100 loss 1.027955174446106
Epoch 72 iteration 200 loss 0.5312687754631042
Epoch 72 Training loss 1.0819147441571477
Epoch 73 iteration 0 loss 0.5490065217018127
Epoch 73 iteration 100 loss 1.0117655992507935
Epoch 73 iteration 200 loss 0.5065831542015076
Epoch 73 Training loss 1.0687738825424347
Epoch 74 iteration 0 loss 0.5063045024871826
Epoch 74 iteration 100 loss 1.0293574333190918
Epoch 74 iteration 200 loss 0.5003397464752197
Epoch 74 Training loss 1.0547682162543772
Epoch 75 iteration 0 loss 0.45235222578048706
Epoch 75 iteration 100 loss 1.0297720432281494
Epoch 75 iteration 200 loss 0.4086465835571289
Epoch 75 Training loss 1.0492441391159522
Evaluation loss 2.945518095083358
Epoch 76 iteration 0 loss 0.46895310282707214
Epoch 76 iteration 100 loss 0.9821916818618774
Epoch 76 iteration 200 loss 0.48269033432006836
Epoch 76 Training loss 1.0391477853463758
Epoch 77 iteration 0 loss 0.4749329388141632
Epoch 77 iteration 100 loss 0.9370260238647461
Epoch 77 iteration 200 loss 0.5174757242202759
Epoch 77 Training loss 1.0302731247109642
Epoch 78 iteration 0 loss 0.4239536225795746
Epoch 78 iteration 100 loss 0.982223391532898
Epoch 78 iteration 200 loss 0.46800896525382996
Epoch 78 Training loss 1.02385489594265
Epoch 79 iteration 0 loss 0.5065938830375671
Epoch 79 iteration 100 loss 0.9628017544746399
Epoch 79 iteration 200 loss 0.4790896773338318
Epoch 79 Training loss 1.014064338724403
Epoch 80 iteration 0 loss 0.43752557039260864
Epoch 80 iteration 100 loss 0.8520130515098572
Epoch 80 iteration 200 loss 0.40985599160194397
Epoch 80 Training loss 1.002772340443797
Evaluation loss 2.9621174652470703
Epoch 81 iteration 0 loss 0.44454529881477356
Epoch 81 iteration 100 loss 0.9402937293052673
Epoch 81 iteration 200 loss 0.41907238960266113
Epoch 81 Training loss 0.9969750344440632
Epoch 82 iteration 0 loss 0.4125458896160126
Epoch 82 iteration 100 loss 0.9050692915916443
Epoch 82 iteration 200 loss 0.5123288035392761
Epoch 82 Training loss 0.989270733289982
Epoch 83 iteration 0 loss 0.4764525592327118
Epoch 83 iteration 100 loss 0.9303292632102966
Epoch 83 iteration 200 loss 0.44956347346305847
Epoch 83 Training loss 0.9836232322264327
Epoch 84 iteration 0 loss 0.48803961277008057
Epoch 84 iteration 100 loss 0.9711679816246033
Epoch 84 iteration 200 loss 0.44382917881011963
Epoch 84 Training loss 0.9754019522005947
Epoch 85 iteration 0 loss 0.46858376264572144
Epoch 85 iteration 100 loss 0.9077855944633484
Epoch 85 iteration 200 loss 0.4368401765823364
Epoch 85 Training loss 0.9719701637084417
Evaluation loss 2.990323471814928
Epoch 86 iteration 0 loss 0.4658893346786499
Epoch 86 iteration 100 loss 0.8741357326507568
Epoch 86 iteration 200 loss 0.423090398311615
Epoch 86 Training loss 0.9583479015194021
Epoch 87 iteration 0 loss 0.4344865381717682
Epoch 87 iteration 100 loss 0.8711681365966797
Epoch 87 iteration 200 loss 0.41789063811302185
Epoch 87 Training loss 0.9474942575734959
Epoch 88 iteration 0 loss 0.42888087034225464
Epoch 88 iteration 100 loss 0.8649926781654358
Epoch 88 iteration 200 loss 0.4007169306278229
Epoch 88 Training loss 0.9426996659812006
Epoch 89 iteration 0 loss 0.4257383942604065
Epoch 89 iteration 100 loss 0.8543802499771118
Epoch 89 iteration 200 loss 0.41755053400993347
Epoch 89 Training loss 0.9360784180891997
Epoch 90 iteration 0 loss 0.44567570090293884
Epoch 90 iteration 100 loss 0.8825702667236328
Epoch 90 iteration 200 loss 0.41934728622436523
Epoch 90 Training loss 0.9298315100552865
Evaluation loss 3.0115221658685347
Epoch 91 iteration 0 loss 0.4208157956600189
Epoch 91 iteration 100 loss 0.813216507434845
Epoch 91 iteration 200 loss 0.4040917158126831
Epoch 91 Training loss 0.9193997417003693
Epoch 92 iteration 0 loss 0.41099944710731506
Epoch 92 iteration 100 loss 0.8445271253585815
Epoch 92 iteration 200 loss 0.3656329810619354
Epoch 92 Training loss 0.9176739377176427
Epoch 93 iteration 0 loss 0.3757087290287018
Epoch 93 iteration 100 loss 0.8153252601623535
Epoch 93 iteration 200 loss 0.3429928421974182
Epoch 93 Training loss 0.908510602970967
Epoch 94 iteration 0 loss 0.42818954586982727
Epoch 94 iteration 100 loss 0.8111163377761841
Epoch 94 iteration 200 loss 0.4069685935974121
Epoch 94 Training loss 0.902406791391548
Epoch 95 iteration 0 loss 0.37496259808540344
Epoch 95 iteration 100 loss 0.7711942195892334
Epoch 95 iteration 200 loss 0.4711993336677551
Epoch 95 Training loss 0.8950450409158558
Evaluation loss 3.034074325896889
Epoch 96 iteration 0 loss 0.3465866148471832
Epoch 96 iteration 100 loss 0.7963153123855591
Epoch 96 iteration 200 loss 0.34403669834136963
Epoch 96 Training loss 0.8901747859619997
Epoch 97 iteration 0 loss 0.40915727615356445
Epoch 97 iteration 100 loss 0.8184841275215149
Epoch 97 iteration 200 loss 0.39140430092811584
Epoch 97 Training loss 0.883020128311112
Epoch 98 iteration 0 loss 0.35649484395980835
Epoch 98 iteration 100 loss 0.858453094959259
Epoch 98 iteration 200 loss 0.3666226267814636
Epoch 98 Training loss 0.8780363934074935
Epoch 99 iteration 0 loss 0.41814950108528137
Epoch 99 iteration 100 loss 0.8482405543327332
Epoch 99 iteration 200 loss 0.3461854159832001
Epoch 99 Training loss 0.8755297044370204
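Note that while the training loss keeps falling (from about 1.76 at epoch 31 down to about 0.88 at epoch 99), the evaluation loss bottoms out near 2.79 and then climbs past 3.0, which suggests the model starts to overfit this small corpus. A minimal sketch of how one might keep the best checkpoint on the dev set is shown below; `train`, `evaluate`, `model`, `train_data`, `dev_data`, `loss_fn`, `optimizer` and the file name are placeholders standing in for the objects defined earlier in the post, and `evaluate` is assumed to return the evaluation loss it prints (the original helper may only print it).

# A minimal sketch, not the post's original training loop: keep the weights
# with the lowest dev loss instead of the (overfitted) final-epoch weights.
best_dev_loss = float('inf')

for epoch in range(100):
    train(model, train_data, loss_fn, optimizer)      # one pass over the training set
    dev_loss = evaluate(model, dev_data, loss_fn)     # the "Evaluation loss" printed above

    if dev_loss < best_dev_loss:                      # new best result on the dev set
        best_dev_loss = dev_loss
        torch.save(model.state_dict(), 'translate_best.pt')  # file name is arbitrary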

9. Call the translate_dev function defined above

# Translate dev sentences 100-119; each call prints the English source,
# the Chinese reference and the model's own translation.
for i in range(100, 120):
    translate_dev(i)
    print()
BOS you have nice skin . EOS
BOS 你 的 皮 膚 真 好 。 EOS
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:33: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at  /pytorch/aten/src/ATen/native/cuda/LegacyDefinitions.cpp:19.)
你最好有很多新鲜事。

BOS you 're UNK correct . EOS
BOS 你 部 分 正 确 。 EOS
你的生身。

BOS everyone admired his courage . EOS
BOS 每 個 人 都 佩 服 他 的 勇 氣 。 EOS
每個人都認釋了他的意見。

BOS what time is it ? EOS
BOS 几 点 了 ? EOS
多少钱?

BOS i 'm free tonight . EOS
BOS 我 今 晚 有 空 。 EOS
我今晚有空。

BOS here is your book . EOS
BOS 這 是 你 的 書 。 EOS
你的書在這裡。

BOS they are at lunch . EOS
BOS 他 们 在 吃 午 饭 。 EOS
他们午吃午饭。

BOS this chair is UNK . EOS
BOS 這 把 椅 子 很 UNK 。 EOS
这里的发生是门。

BOS it 's pretty heavy . EOS
BOS 它 真 重 。 EOS
它是居机场的。

BOS many attended his funeral . EOS
BOS 很 多 人 都 参 加 了 他 的 葬 礼 。 EOS
每个人都知道他的音樂。

BOS training will be provided . EOS
BOS 会 有 训 练 。 EOS
即待有空光。

BOS someone is watching you . EOS
BOS 有 人 在 看 著 你 。 EOS
有人在看你。

BOS i slapped his face . EOS
BOS 我 摑 了 他 的 臉 。 EOS
我愛他打斷了。

BOS i like UNK music . EOS
BOS 我 喜 歡 流 行 音 樂 。 EOS
我喜欢阅读。

BOS tom had no children . EOS
BOS T o m 沒 有 孩 子 。 EOS
汤姆没有孩子。

BOS please lock the door . EOS
BOS 請 把 門 鎖 上 。 EOS
請關門門。

BOS tom has calmed down . EOS
BOS 汤 姆 冷 静 下 来 了 。 EOS
Tom有三個走。

BOS please speak more loudly . EOS
BOS 請 說 大 聲 一 點 兒 。 EOS
請講更多的聲外。

BOS keep next sunday free . EOS
BOS 把 下 周 日 空 出 来 。 EOS
下個星期一下吧。

BOS i made a mistake . EOS
BOS 我 犯 了 一 個 錯 。 EOS
我错了錯誤。
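The UserWarning in the first output above is raised because a torch.uint8 mask is passed to masked_fill_, which newer PyTorch versions deprecate in favour of torch.bool masks. The snippet below is a minimal, self-contained illustration of the fix; `scores` and `mask` are placeholder names, not the variables used in the attention code earlier in the post.

import torch

# Stand-in for attention scores and the padding mask built as a ByteTensor.
scores = torch.randn(2, 5)
mask = torch.tensor([[0, 0, 1, 1, 1],
                     [0, 0, 0, 1, 1]], dtype=torch.uint8)

# Deprecated: scores.masked_fill_(mask, -1e6)
scores.masked_fill_(mask.bool(), -1e6)   # casting the mask to bool removes the warning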
Original article: https://www.cnblogs.com/douzujun/p/13624567.html