NLP Knowledge Points Summary (Part 2)

2. Language Models

  Language models were covered in an earlier post; the link is attached here for easy review.

  https://www.cnblogs.com/dhName/p/11357774.html

3. CNN + RNN

  Both of these networks should be thoroughly familiar by now.

  A CNN slides a window of width W over the sentence and convolves it repeatedly, then applies pooling, finally producing a semantic representation of the whole sentence.
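  To make the window-convolution-plus-pooling idea concrete, here is a minimal, illustrative sketch (my own code, not from the original post) of a TextCNN-style sentence encoder in PyTorch; the class name TextCNNEncoder and its hyperparameters are made up for illustration, and the window width W corresponds to kernel_size.

import torch
import torch.nn as nn

class TextCNNEncoder(nn.Module):
    """Convolve a width-W window over the word embeddings, then max-pool
    over time to get a fixed-size semantic vector for the sentence."""
    def __init__(self, vocab_size, embed_dim=50, num_filters=100, window=3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # in_channels = embedding dim, kernel_size = window width W
        self.conv = nn.Conv1d(embed_dim, num_filters, kernel_size=window)

    def forward(self, token_ids):              # (batch, seq_len) of word indices
        x = self.embed(token_ids)              # (batch, seq_len, embed_dim)
        x = x.transpose(1, 2)                  # (batch, embed_dim, seq_len) for Conv1d
        x = torch.relu(self.conv(x))           # (batch, num_filters, seq_len - window + 1)
        return torch.max(x, dim=2).values      # max-pool over time -> (batch, num_filters)

# toy usage: one sentence of 5 word ids -> one (1, 100) sentence vector
sent = torch.tensor([[1, 4, 2, 7, 3]])
vec = TextCNNEncoder(vocab_size=10)(sent)
print(vec.shape)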

  An RNN feeds the ordered text into the network step by step, so each input depends on the ones that came before it.

  Note that the CNN only performs local semantic convolution and cannot handle long-range dependencies; this is why the RNN was introduced.

  The RNN, however, is prone to vanishing or exploding gradients, which is what later motivated the LSTM as an improved RNN.
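  As a rough usage sketch (illustrative only, not from the post): in PyTorch an ordered sequence of word vectors is fed to nn.RNN or nn.LSTM in exactly the same way; only the recurrent cell differs, with the LSTM's gating easing the vanishing/exploding gradient problem over long spans.

import torch
import torch.nn as nn

seq_len, batch, embed_dim, hidden_dim = 7, 1, 5, 4
inputs = torch.randn(seq_len, batch, embed_dim)   # an ordered sequence of 7 word vectors

rnn = nn.RNN(embed_dim, hidden_dim)    # vanilla RNN: prone to vanishing/exploding gradients
lstm = nn.LSTM(embed_dim, hidden_dim)  # gated LSTM: the drop-in replacement

rnn_out, h_n = rnn(inputs)             # rnn_out: (seq_len, batch, hidden_dim), one state per step
lstm_out, (h_n, c_n) = lstm(inputs)    # the LSTM additionally carries a cell state c_n
print(rnn_out.shape, lstm_out.shape)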

4. LSTM + CRF

  An LSTM is simply an RNN with a few gates added. CRF stands for conditional random field, a model that was itself hugely popular before neural networks took off.

  Combined, the two are commonly used for sequence labeling problems such as NER and word segmentation, e.g., assigning each token a B/I/O tag to mark entity spans.

  The following explains, from the underlying principles, why LSTM + CRF works so well for NER.

  Reposted from: https://blog.csdn.net/qq_17677907/article/details/88096243
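  In brief (my own summary of the standard CRF-layer formulation, matching the code below rather than the linked article): the BiLSTM produces an emission score P_{i, y} for each token i and tag y, and the CRF adds a learned transition score A_{y', y} for moving from tag y' to tag y. With y_0 = START and y_{n+1} = STOP, a tag sequence y for sentence x is scored as

  score(x, y) = \sum_{i=1}^{n} P_{i, y_i} + \sum_{i=1}^{n+1} A_{y_{i-1}, y_i}

  p(y \mid x) = \frac{\exp(score(x, y))}{\sum_{y'} \exp(score(x, y'))}

  and training minimizes the negative log-likelihood

  -\log p(y \mid x) = \log \sum_{y'} \exp(score(x, y')) - score(x, y)

  where the first term (the log partition function) is computed by the forward algorithm (_forward_alg), the second by _score_sentence, and decoding finds the highest-scoring path with Viterbi (_viterbi_decode). The transition matrix is what lets the model learn that some tag transitions (e.g., O followed by I in a BIO scheme) are unlikely, which a per-token classifier over BiLSTM outputs alone cannot capture; this is the usual explanation for why LSTM + CRF does so well on NER.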

   

  Code for NER (PyTorch):

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)


#### main.py
# Training/evaluation script. It assumes the BiLSTM_CRF class from the
# model.py section and the helpers (prepare_sequence, argmax, log_sum_exp)
# from the util section below are already defined or imported.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 5
HIDDEN_DIM = 4

# Make up some training data
training_data = [(
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
)]

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# Check predictions before training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    precheck_tags = torch.tensor([tag_to_ix[t] for t in training_data[0][1]], dtype=torch.long)
    print(model(precheck_sent))

for epoch in range(300):  # normally you would NOT do 300 epochs, this is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance.
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        # Step 3. Run our forward pass and compute the CRF negative log-likelihood.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the gradients and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    print(model(precheck_sent))

#### model.py
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we don't want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # don't confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

#### util
# Helper functions shared by main.py and model.py.

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))