命名实体识别之bert+bilstm（基于tensorflow）

接下来我们继续对官方基于bert的模型进行扩展，之前的可参考：

基于bert命名实体识别（一）数据处理

命名实体识别数据预处理

命名实体识别之创建训练数据

命名实体识别之使用tensorflow的bert模型进行微调

命名实体识别之动态融合不同bert层的特征（基于tensorflow）

直接看代码：

class MyModel:
  def __init__(self, config):
    self.config = config
    # 喂入模型的数据占位符
    self.input_x_word = tf.placeholder(tf.int32, [None, None], name="input_x_word")
    self.input_x_len = tf.placeholder(tf.int32, name='input_x_len')
    self.input_mask = tf.placeholder(tf.int32, [None, None], name='input_mask')
    self.input_relation = tf.placeholder(tf.int32, [None, None], name='input_relation')  # 实体NER的真实标签
    self.keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
    self.is_training = tf.placeholder(tf.bool, None, name='is_training')
    self.initializer = initializers.xavier_initializer()
    self.lstm_dim = self.config.lstm_dim
    self.relation_num = self.config.relation_num
    self.num_steps = tf.shape(self.input_x_word)[-1]
    print("self.num_steps.shape[-1]:",tf.shape(self.input_x_word)[-1])
    self.bert_embed(bert_init=True)
  

  def biLSTM_layer(self, lstm_inputs, lstm_dim, lengths, name=None):
        """
        :param lstm_inputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, 2*lstm_dim]
        """
        with tf.name_scope("char_BiLSTM" if not name else name):
            lstm_cell = {}
            for direction in ["forward", "backward"]:
                with tf.name_scope(direction):
                    lstm_cell[direction] = rnn.CoupledInputForgetGateLSTMCell(
                        lstm_dim,
                        use_peepholes=True,
                        initializer=self.initializer,
                        state_is_tuple=True)
            outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
                lstm_cell["forward"],
                lstm_cell["backward"],
                lstm_inputs,
                dtype=tf.float32,
                sequence_length=lengths)
        return tf.concat(outputs, axis=2)
  def project_layer(self, lstm_outputs, name=None):
      """
      hidden layer between lstm layer and logits
      :param lstm_outputs: [batch_size, num_steps, emb_size]
      :return: [batch_size, num_steps, num_tags]
      """
      with tf.name_scope("project" if not name else name):
          with tf.name_scope("hidden"):
              W = tf.get_variable("HW", shape=[self.lstm_dim * 2, self.lstm_dim],
                                  dtype=tf.float32, initializer=self.initializer)

              b = tf.get_variable("Hb", shape=[self.lstm_dim], dtype=tf.float32,
                                  initializer=tf.zeros_initializer())
              output = tf.reshape(lstm_outputs, shape=[-1, self.lstm_dim * 2])
              hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))

          # project to score of tags
          with tf.name_scope("logits"):
              W = tf.get_variable("LW", shape=[self.lstm_dim, self.relation_num],
                                  dtype=tf.float32, initializer=self.initializer)

              b = tf.get_variable("Lb", shape=[self.relation_num], dtype=tf.float32,
                                  initializer=tf.zeros_initializer())

              pred = tf.nn.xw_plus_b(hidden, W, b)

          return tf.reshape(pred, [-1, self.num_steps, self.relation_num], name='pred_logits')
  
  def loss_without_crf(self, output_layer, num_labels, bert_init=True):
    with tf.variable_scope("loss"):
      self.logits = output_layer
      self.probabilities = tf.nn.softmax(self.logits, axis=-1)
      log_probs = tf.nn.log_softmax(self.logits, axis=-1) # [?,11]
      print("log_probs.shape:",log_probs.shape)

      self.predictions = tf.argmax(self.logits, axis=-1, name="predictions")

      one_hot_labels = tf.one_hot(self.input_relation, depth=num_labels, dtype=tf.float32) # [?,512,11]
      #print(one_hot_labels)
      #print("one_hot_labels.shape:",one_hot_labels.shape)
      self.per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
      #print("self.per_example_loss.shape:",self.per_example_loss.shape)
      self.loss = tf.reduce_mean(self.per_example_loss)
      print(self.loss)
      #print("self.loss.shape:",self.loss.shape)
      tvars = tf.trainable_variables()
      init_checkpoint = self.config.bert_file 
      assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if bert_init:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
      tf.logging.info("**** Trainable Variables ****")
      for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        print("  name = {}, shape = {}{}".format(var.name, var.shape, init_string))
      print('init bert from checkpoint: {}'.format(init_checkpoint))
      #return self.loss, self.per_example_loss, self.logits, self.probabilities
      


  def bert_embed(self, bert_init=True):
    """
    读取BERT的TF模型
    :param bert_init:
    :return:
    """
    num_labels = self.config.relation_num
    bert_config_file = self.config.bert_config_file
    bert_config = BertConfig.from_json_file(bert_config_file)

    model = BertModel(
        config=bert_config,
        is_training=self.is_training,  # 微调
        input_ids=self.input_x_word,
        input_mask=self.input_mask,
        token_type_ids=None,
        use_one_hot_embeddings=False)

    # If you want to use the token-level output, use model.get_sequence_output()
    # output_layer = model.get_pooled_output() # [?,768]
    # print("output_layer.shape:",output_layer)
    used = tf.sign(tf.abs(self.input_x_word))
    length = tf.reduce_sum(used, reduction_indices=1)
    self.lengths = tf.cast(length, tf.int32)
    output_layer = model.get_sequence_output()
    lstm_inputs = tf.nn.dropout(output_layer, 0.9)
    output_layer = self.biLSTM_layer(lstm_inputs, self.lstm_dim, self.lengths)
    output_layer = self.project_layer(output_layer)
    print("output_layer.shape:", output_layer.shape)
    self.loss_without_crf(output_layer, num_labels)
    import sys
    sys.exit(0)

结果：

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/optimization.py:155: The name tf.train.AdamOptimizer is deprecated. Please use tf.compat.v1.train.AdamOptimizer instead.

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

{'锞', '蚨', '螭', '荑', 'Q', '芘', '呭', '铧', '㈠', '靊', '铖', '氺', '狻', '涜', '鍙', '暍', '閮', '椴', '茀', '锎', '莳', 'Z', 'U', '幤', 'X', '瀵', '’', 'F', 'L', '魑', '鸃', '汭', '嗾', '浐', '谡', '媖', '殚', '曁', '佺', '讣', '戋', 'I', '聃', '骟', '劼', '缁', '嶶', '锛', 'T', '椱', '寳', '叒', '燚', '歃', 'G', '骝', 'O', 'V', '笪', '楒', '赟', '`', '辔', '姇', '狴', '愠', '鹆', 'W', 'S', 'N', '茕', '莜', '叕', '鑱', 'K', '犴', '菥', '塱', '捭', 'Y', 'R', '毖', '閞', '郯', '咑', '鎸', '瘜', '劖', '嗮', 'D', '猊', '囵', '旂', '忔', '亓', '”', '邴', 'E', '龉', '檩', 'B', 'J', '鸱', '狲', '犇', '谝', '茆', '旳', '噃', '钅', '祹', '渼', '魉', '瘕', '鐢', '綉', 'ue40a', '瑧', '槊', '翀', '跬', '屃', '疄', '犰', '勮', 'C', '梿', '鐜', '撺', '跶', '釆', '嚚', '铱', '镚', '戝', '罝', 'P', '亻', '“', '祼', '褴', '睚', '貹', '铗', '庒', '鍒', '姤', '圪', '浡', '帀', '綯', '龃', '讠', '‘', '猢', '睍', '斲', '屼', 'M', 'A', 'H', '诓', '簰', '雬', '俰', '玎'}
8012
{'桭', 'C', 'T', '歀', 'Q', 'D', 'G', '靊', '烎', 'P', '”', '锎', '“', 'E', 'O', 'V', '緈', 'Z', 'J', 'B', 'U', 'X', '’', 'F', 'L', '‘', 'W', '尓', 'N', 'S', 'K', '浐', '諨', 'A', 'H', 'Y', 'M', 'R', 'I'}
1105
WARNING:tensorflow:From test_bert.py:388: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

2020-12-13 14:07:47.209770: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2020-12-13 14:07:47.265991: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:47.266613: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Found device 0 with properties: 
name: Tesla T4 major: 7 minor: 5 memoryClockRate(GHz): 1.59
pciBusID: 0000:00:04.0
2020-12-13 14:07:47.266923: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-12-13 14:07:47.493085: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-12-13 14:07:47.621614: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-12-13 14:07:47.641392: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2020-12-13 14:07:47.925153: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10
2020-12-13 14:07:47.943921: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10
2020-12-13 14:07:48.468415: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-12-13 14:07:48.468625: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.469411: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.470004: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1767] Adding visible gpu devices: 0
2020-12-13 14:07:48.525931: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2200000000 Hz
2020-12-13 14:07:48.526210: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x270ef40 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2020-12-13 14:07:48.526244: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2020-12-13 14:07:48.677879: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.678754: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x270f100 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2020-12-13 14:07:48.678790: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2020-12-13 14:07:48.679588: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.680198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Found device 0 with properties: 
name: Tesla T4 major: 7 minor: 5 memoryClockRate(GHz): 1.59
pciBusID: 0000:00:04.0
2020-12-13 14:07:48.680265: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-12-13 14:07:48.680295: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2020-12-13 14:07:48.680319: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcufft.so.10
2020-12-13 14:07:48.680346: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcurand.so.10
2020-12-13 14:07:48.680371: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusolver.so.10
2020-12-13 14:07:48.680393: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcusparse.so.10
2020-12-13 14:07:48.680416: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudnn.so.7
2020-12-13 14:07:48.680497: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.681158: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.681699: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1767] Adding visible gpu devices: 0
2020-12-13 14:07:48.684658: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-12-13 14:07:48.686073: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1180] Device interconnect StreamExecutor with strength 1 edge matrix:
2020-12-13 14:07:48.686103: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1186]      0 
2020-12-13 14:07:48.686114: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1199] 0:   N 
2020-12-13 14:07:48.687110: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.687768: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2020-12-13 14:07:48.688359: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2020-12-13 14:07:48.688404: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1325] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14221 MB memory) -> physical GPU (device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5)
WARNING:tensorflow:From test_bert.py:176: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

self.num_steps.shape[-1]: Tensor("strided_slice_1:0", shape=(), dtype=int32)
WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:175: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:416: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:497: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.

WARNING:tensorflow:
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:364: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:874: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
WARNING:tensorflow:From /content/drive/My Drive/Deep-Learning-With-Python/chapter8/CCF_ner/tf_utils/bert_modeling.py:282: The name tf.erf is deprecated. Please use tf.math.erf instead.

WARNING:tensorflow:From test_bert.py:209: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/rnn.py:464: dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
WARNING:tensorflow:Entity <bound method CoupledInputForgetGateLSTMCell.call of <tf_utils.rnncell.CoupledInputForgetGateLSTMCell object at 0x7f297e2d2eb8>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: module 'gast' has no attribute 'Num'
WARNING:tensorflow:From /tensorflow-1.15.2/python3.6/tensorflow_core/python/ops/rnn.py:244: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
WARNING:tensorflow:Entity <bound method CoupledInputForgetGateLSTMCell.call of <tf_utils.rnncell.CoupledInputForgetGateLSTMCell object at 0x7f297e2d2fd0>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: module 'gast' has no attribute 'Num'
WARNING:tensorflow:From test_bert.py:225: The name tf.nn.xw_plus_b is deprecated. Please use tf.compat.v1.nn.xw_plus_b instead.

output_layer.shape: (?, ?, 11)
log_probs.shape: (?, ?, 11)
self.per_example_loss.shape: (?, ?)
self.loss.shape: ()
WARNING:tensorflow:From test_bert.py:255: The name tf.trainable_variables is deprecated. Please use tf.compat.v1.trainable_variables instead.

WARNING:tensorflow:From test_bert.py:259: The name tf.train.init_from_checkpoint is deprecated. Please use tf.compat.v1.train.init_from_checkpoint instead.

WARNING:tensorflow:From test_bert.py:260: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

  name = bert/embeddings/word_embeddings:0, shape = (21128, 768), *INIT_FROM_CKPT*
  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
  name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
  name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_xi:0, shape = (768, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_hi:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_ci:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_xo:0, shape = (768, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_ho:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_co:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_xc:0, shape = (768, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_w_hc:0, shape = (256, 256)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_b_i:0, shape = (256,)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_b_c:0, shape = (256,)
  name = bidirectional_rnn/fw/coupled_input_forget_gate_lstm_cell/_b_o:0, shape = (256,)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_xi:0, shape = (768, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_hi:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_ci:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_xo:0, shape = (768, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_ho:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_co:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_xc:0, shape = (768, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_w_hc:0, shape = (256, 256)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_b_i:0, shape = (256,)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_b_c:0, shape = (256,)
  name = bidirectional_rnn/bw/coupled_input_forget_gate_lstm_cell/_b_o:0, shape = (256,)
  name = HW:0, shape = (512, 256)
  name = Hb:0, shape = (256,)
  name = LW:0, shape = (256, 11)
  name = Lb:0, shape = (11,)
init bert from checkpoint: /content/drive/MyDrive/Deep-Learning-With-Python/chapter8/CCF_ner/bert_pretrained/bert_model.ckpt
WARNING:tensorflow:From test_bert.py:392: The name tf.train.exponential_decay is deprecated. Please use tf.compat.v1.train.exponential_decay instead.

bert train variable num: 199
normal train variable num: 26
word2vec trainable!!

说明：

我们可以直接调用官方的tensorflow的bert模型来使用bert，接下来，我们使用output_layer = model.get_sequence_output()来获得最后一层的特征，然后接下来在添加bilstm层，

对于bilstm的前向和反向的输出进行拼接后，经过一个project_layer()函数计算logits，最后再经过一个损失层计算损失和其它的一些预测的值等。同时我们要将预训练bert模型的参数导入到bert中。

这里面我们可以通过这种方式计算每个序列的长度：

used = tf.sign(tf.abs(self.input_x_word))
length = tf.reduce_sum(used, reduction_indices=1)
self.lengths = tf.cast(length, tf.int32)

当然，在喂入数据的时候，我们也已经传入了长度了，可以酌情使用。

当bert+bilstm之后，一般而言bert微调的学习率和bilstm的学习率是要设置成不同的，比如一下代码：

# 超参数设置
            global_step = tf.Variable(0, name='step', trainable=False)
            learning_rate = tf.train.exponential_decay(config.learning_rate, global_step, config.decay_step,
                                                       config.decay_rate, staircase=True)

            normal_optimizer = tf.train.AdamOptimizer(learning_rate)  # 下接结构的学习率

            all_variables = graph.get_collection('trainable_variables')
            word2vec_var_list = [x for x in all_variables if 'bert' in x.name]  # BERT的参数
            normal_var_list = [x for x in all_variables if 'bert' not in x.name]  # 下接结构的参数
            print('bert train variable num: {}'.format(len(word2vec_var_list)))
            print('normal train variable num: {}'.format(len(normal_var_list)))
            normal_op = normal_optimizer.minimize(model.loss, global_step=global_step, var_list=normal_var_list)
            num_batch = int(train_iter.num_records / config.batch_size * config.train_epoch)
            embed_step = tf.Variable(0, name='step', trainable=False)
            if word2vec_var_list:  # 对BERT微调
                print('word2vec trainable!!')
                word2vec_op, embed_learning_rate, embed_step = create_optimizer(
                    model.loss, config.embed_learning_rate, num_train_steps=num_batch,
                    num_warmup_steps=int(num_batch * 0.05) , use_tpu=False ,  variable_list=word2vec_var_list
                )

                train_op = tf.group(normal_op, word2vec_op)  # 组装BERT与下接结构参数
            else:
                train_op = normal_op

一般bert+bilstm之后还需要接一个crf（条件随机场），我们下节继续。