[Text Summarization Project] 7: Performance Improvement with a Transformer-Based PGN Model

Background

    In the previous article we used the Transformer, currently one of the mainstream models. The Transformer is essentially built on the Encoder-Decoder framework, and its decoding procedure is essentially the same as that of a seq2seq model. One important limitation it shares with seq2seq is that decoding can only generate words from a fixed vocabulary; the PGN (pointer-generator network) model removes this limitation by also copying words directly from the source text. This article therefore combines the Transformer's strong feature-extraction ability with the PGN's copy-and-generate mechanism to see what the combination yields. The principles behind both models have been explained in detail in earlier articles of this series, so the focus here is on the implementation.

Core Content

Overall Workflow

    The overall workflow of the project, such as data loading, the training loop and the test/inference flow, is basically the same as in the previous models. Since this article builds directly on the Transformer implementation from the previous article, most of the code is identical; the difference lies in the integration of the PGN mechanism.

Overall Model

    The Transformer architecture is largely the same as in the previous article: an Encoder, a Decoder, and a final output layer.

import tensorflow as tf
from tensorflow.keras.layers import Embedding

# Encoder, Decoder and calc_final_dist are defined elsewhere in the project
class PGN_TRANSFORMER(tf.keras.Model):
    def __init__(self, params):
        super(PGN_TRANSFORMER, self).__init__()

        self.num_blocks = params["num_blocks"]
        self.batch_size = params["batch_size"]
        self.vocab_size = params["vocab_size"]
        self.num_heads = params["num_heads"]

        self.embedding = Embedding(params["vocab_size"], params["d_model"])

        self.encoder = Encoder(params["num_blocks"],
                               params["d_model"],
                               params["num_heads"],
                               params["dff"],
                               params["vocab_size"],
                               params["dropout_rate"])

        self.decoder = Decoder(params["num_blocks"],
                               params["d_model"],
                               params["num_heads"],
                               params["dff"],
                               params["vocab_size"],
                               params["dropout_rate"])

        self.final_layer = tf.keras.layers.Dense(params["vocab_size"])

    def call(self, inp, extended_inp, max_oov_len, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):

        # shared embedding for the encoder and decoder inputs
        embed_x = self.embedding(inp)
        embed_dec = self.embedding(tar)

        enc_output = self.encoder(embed_x, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights, p_gens = self.decoder(embed_dec,
                                                             enc_output,
                                                             training,
                                                             look_ahead_mask,
                                                             dec_padding_mask)

        final_output = self.final_layer(dec_output)
        # (batch_size, tar_seq_len, target_vocab_size)
        final_output = tf.nn.softmax(final_output)

        # encoder-decoder attention from the last decoder block
        attn_dists = attention_weights['decoder_layer{}_block2'.format(self.num_blocks)]
        # (batch_size, num_heads, targ_seq_len, inp_seq_len)

        # average over the attention heads
        attn_dists = tf.reduce_sum(attn_dists, axis=1) / self.num_heads
        # (batch_size, targ_seq_len, inp_seq_len)

        final_dists = calc_final_dist(extended_inp,
                                      tf.unstack(final_output, axis=1),
                                      tf.unstack(attn_dists, axis=1),
                                      tf.unstack(p_gens, axis=1),
                                      max_oov_len,
                                      self.vocab_size,
                                      self.batch_size)

        outputs = dict(logits=tf.stack(final_dists, 1), attentions=attn_dists)
        return outputs
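
    For orientation, here is a minimal usage sketch of the forward pass. The hyperparameters, tensor sizes and inline mask construction are assumptions made purely for illustration; the real project builds its batches and masks in its own data pipeline, and the Encoder and DecoderLayer classes come from the previous article.

import tensorflow as tf

params = {"num_blocks": 2, "d_model": 128, "num_heads": 8, "dff": 512,
          "vocab_size": 10000, "dropout_rate": 0.1, "batch_size": 4}
model = PGN_TRANSFORMER(params)

batch_size, inp_len, tar_len = 4, 20, 10
inp = tf.random.uniform((batch_size, inp_len), maxval=params["vocab_size"], dtype=tf.int32)
extended_inp = inp   # would contain ids >= vocab_size for in-article OOV words
max_oov_len = 3      # max number of in-article OOV words in this batch
tar = tf.random.uniform((batch_size, tar_len), maxval=params["vocab_size"], dtype=tf.int32)

# standard Transformer masks, built inline for this sketch (1.0 marks masked positions)
enc_padding_mask = tf.cast(tf.math.equal(inp, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
dec_padding_mask = enc_padding_mask
look_ahead = 1 - tf.linalg.band_part(tf.ones((tar_len, tar_len)), -1, 0)
tar_padding = tf.cast(tf.math.equal(tar, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
combined_mask = tf.maximum(tar_padding, look_ahead)

outputs = model(inp, extended_inp, max_oov_len, tar, True,
                enc_padding_mask, combined_mask, dec_padding_mask)
print(outputs["logits"].shape)   # (batch_size, tar_len, vocab_size + max_oov_len)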

    The difference from the plain Transformer lies in the call function: during decoding, the decoder additionally returns the generation probability p_gen and the encoder-decoder attention distribution of its last block. When the final probability distribution is computed (calc_final_dist), the (extended) vocabulary distribution and the attention scores are combined.
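
    For reference, calc_final_dist implements the standard pointer-generator mixture (See et al., 2017): at each decoding step,

$$P_{final}(w) = p_{gen}\,P_{vocab}(w) + (1 - p_{gen})\sum_{i:\,w_i = w} a_i$$

    where a_i is the attention weight on the i-th source token, so words outside the fixed vocabulary can still receive probability mass through the copy term.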

def calc_final_dist(_enc_batch_extend_vocab, vocab_dists, attn_dists, p_gens, batch_oov_len, vocab_size, batch_size):
    """
    Calculate the final distribution, for the pointer-generator model
    Args:
    vocab_dists: The vocabulary distributions. List length max_dec_steps of (batch_size, vsize) arrays.
                The words are in the order they appear in the vocabulary file.
    attn_dists: The attention distributions. List length max_dec_steps of (batch_size, attn_len) arrays
    Returns:
    final_dists: The final distributions. List length max_dec_steps of (batch_size, extended_vsize) arrays.
    """
    # Multiply vocab dists by p_gen and attention dists by (1-p_gen)
    vocab_dists = [p_gen * dist for (p_gen, dist) in zip(p_gens, vocab_dists)]
    attn_dists = [(1-p_gen) * dist for (p_gen, dist) in zip(p_gens, attn_dists)]

    # Concatenate some zeros to each vocabulary dist, to hold the probabilities for in-article OOV words
    # the maximum (over the batch) size of the extended vocabulary
    extended_size = vocab_size + batch_oov_len
    extra_zeros = tf.zeros((batch_size, batch_oov_len))
    # list length max_dec_steps of shape (batch_size, extended_size)
    vocab_dists_extended = [tf.concat(axis=1, values=[dist, extra_zeros]) for dist in vocab_dists]

    # Project the values in the attention distributions onto the appropriate entries in the final distributions
    # This means that if a_i = 0.1 and the ith encoder word is w, and w has index 500 in the vocabulary
    # then we add 0.1 onto the 500th entry of the final distribution
    # This is done for each decoder timestep.
    # This is fiddly; we use tf.scatter_nd to do the projection
    batch_nums = tf.range(0, limit=batch_size)  # shape (batch_size)
    batch_nums = tf.expand_dims(batch_nums, 1)  # shape (batch_size, 1)

    attn_len = tf.shape(_enc_batch_extend_vocab)[1]  # number of states we attend over
    batch_nums = tf.tile(batch_nums, [1, attn_len])  # shape (batch_size, attn_len)
    indices = tf.stack((batch_nums, _enc_batch_extend_vocab), axis=2)  # shape (batch_size, enc_t, 2)
    shape = [batch_size, extended_size]

    # list length max_dec_steps (batch_size, extended_size)
    attn_dists_projected = [tf.scatter_nd(indices, copy_dist, shape) for copy_dist in attn_dists]

    # Add the vocab distributions and the copy distributions together to get the final distributions
    # final_dists is a list length max_dec_steps; each entry is a tensor shape (batch_size, extended_size) giving
    # the final distribution for that decoder timestep
    # Note that for decoder timesteps and examples corresponding to a [PAD] token, this is junk - ignore.
    final_dists = [vocab_dist + copy_dist for (vocab_dist, copy_dist) in zip(vocab_dists_extended, attn_dists_projected)]

    return final_dists
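
    A tiny numerical check (hypothetical numbers) makes the behaviour concrete: with a vocabulary of size 5, a batch of one, a single decoding step and one in-article OOV word whose extended id is 5, the copied probability mass appears at index 5 of the extended distribution.

import tensorflow as tf

enc_extend_vocab = tf.constant([[2, 5, 3]])                # (batch, attn_len); id 5 is the in-article OOV
vocab_dist = [tf.constant([[0.1, 0.2, 0.3, 0.2, 0.2]])]    # one step, (batch, vocab_size)
attn_dist = [tf.constant([[0.5, 0.3, 0.2]])]               # one step, (batch, attn_len)
p_gen = [tf.constant([[0.7]])]                             # one step, (batch, 1)

final = calc_final_dist(enc_extend_vocab, vocab_dist, attn_dist, p_gen,
                        batch_oov_len=1, vocab_size=5, batch_size=1)
print(final[0].numpy())  # shape (1, 6); index 5 holds (1 - 0.7) * 0.3 = 0.09, the mass copied for the OOV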

Decoder

    The Decoder differs from the standard Transformer Decoder only in the computation of the context vector and of the generation probability p_gen.
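
    For reference, this roughly mirrors the generation probability of the original pointer-generator paper,

$$p_{gen} = \sigma\big(W_h h_t^* + W_s s_t + W_x x_t\big)$$

    where h_t^* is the context vector, s_t the decoder output and x_t the decoder input embedding; the implementation below additionally applies a Dense layer V before the sigmoid.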

class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.depth = self.d_model // self.num_heads

        # DecoderLayer is the standard Transformer decoder block from the previous article
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

        # linear projections used to compute the generation probability p_gen
        self.Wh = tf.keras.layers.Dense(1)
        self.Ws = tf.keras.layers.Dense(1)
        self.Wx = tf.keras.layers.Dense(1)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):

        attention_weights = {}
        out = self.dropout(x, training=training)

        for i in range(self.num_layers):
            out, block1, block2 = self.dec_layers[i](out, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # x.shape == (batch_size, target_seq_len, d_model)

        # context vectors
        enc_out_shape = tf.shape(enc_output)
        context = tf.reshape(enc_output, (enc_out_shape[0], enc_out_shape[1], self.num_heads, self.depth))  # shape : (batch_size, input_seq_len, num_heads, depth)
        context = tf.transpose(context, [0, 2, 1, 3])  # (batch_size, num_heads, input_seq_len, depth)
        context = tf.expand_dims(context, axis=2)  # (batch_size, num_heads, 1, input_seq_len, depth)

        attn = tf.expand_dims(block2, axis=-1)  # (batch_size, num_heads, target_seq_len, input_seq_len, 1)
        context = context * attn  # (batch_size, num_heads, target_seq_len, input_seq_len, depth)
        context = tf.reduce_sum(context, axis=3)  # (batch_size, num_heads, target_seq_len, depth)
        context = tf.transpose(context, [0, 2, 1, 3])  # (batch_size, target_seq_len, num_heads, depth)
        context = tf.reshape(context, (tf.shape(context)[0], tf.shape(context)[1], self.d_model))  # (batch_size, target_seq_len, d_model)

        # generation probability p_gen, computed from the decoder input embedding,
        # the decoder output and the context vector
        a = self.Wx(x)
        b = self.Ws(out)
        c = self.Wh(context)
        p_gens = tf.sigmoid(self.V(a + b + c))  # (batch_size, target_seq_len, 1)

        return out, attention_weights, p_gens

Complete Code

Code on GitHub

Original post (in Chinese): https://www.cnblogs.com/miners/p/15195374.html