5 分类和标注词汇

1.词性标注器 parts-of-speech 或 POS tagger nltk自带英文标注器

import nltk
# Tokenize the sentence into a list of word strings, then tag every token.
text = nltk.word_tokenize("And now for something completely different")
print(nltk.pos_tag(text))

# upenn_tagset() prints the tag documentation itself; the surrounding
# print() then emits the extra "None" seen in the recorded output.
print(nltk.help.upenn_tagset('RB'))

# Lower-cased Brown corpus wrapped in an nltk.Text object.
text1 = nltk.Text(token.lower() for token in nltk.corpus.brown.words())
print(type(text1))  # <class 'nltk.text.Text'>

# Print the words that nltk.Text reports as similar to 'bought'.
text1.similar('bought')

 结果如下：

[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]
RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...
None
<class 'nltk.text.Text'>
made said done put had seen found given left heard was been brought
set got that took in told felt

2.标注语料库

# Represent an already-tagged token as a (word, tag) tuple.
tagged_token = nltk.tag.str2tuple('fly/NN')
print(tagged_token)  # ('fly', 'NN')

# Turn a "word/TAG word/TAG ..." string into a list of (word, tag) tuples.
sent = """the/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN other/AP topics/NNS."""
res = [nltk.tag.str2tuple(t) for t in sent.split()]
print(res)
结果如下：
('fly', 'NN')
[('the', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD'),
('on', 'IN'), ('a', 'AT'), ('number', 'NN'), ('of', 'IN'), ('other', 'AP'), ('topics', 'NNS.')]

# Reading an already-tagged corpus.
print(nltk.corpus.brown.tagged_words())
# [('The', 'AT'), ('Fulton', 'NP-TL'), ...]

print(nltk.corpus.indian)
from nltk.corpus import brown
# Same news text, but with the tags mapped onto the universal tagset.
browd_news_tagged = brown.tagged_words(categories='news', tagset='universal')
print(browd_news_tagged)
tag_fd = nltk.FreqDist(tag for (word, tag) in browd_news_tagged)
print(tag_fd.keys())
tag_fd.plot(cumulative=True)  # cumulative frequency distribution plot

结果如下：
[('The', 'AT'), ('Fulton', 'NP-TL'), ...]
<IndianCorpusReader in '.../corpora/indian' (not loaded yet)>
[('The', 'DET'), ('Fulton', 'NOUN'), ...]
dict_keys(['DET', 'NOUN', 'ADJ', 'VERB', 'ADP', '.', 'ADV', 'CONJ', 'PRT', 'PRON', 'NUM', 'X'])

名词、动词、形容词等

名词

# Collect the tags of the tokens that immediately precede a noun.
# browd_news_tagged uses the universal tagset, whose noun tag is 'NOUN'.
# The previous filter b[1] == 'N' matched nothing, which is why these
# notes record an empty list for res1.
word_tag_pairs = nltk.bigrams(browd_news_tagged)
print(word_tag_pairs)  # prints the generator object, not the pairs
res1 = list(nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN'))
print(res1)

from nltk.corpus import brown
# Distinct (word, tag) pairs of the news text; list those tagged as verbs.
word_tag = nltk.FreqDist(brown.tagged_words(categories="news"))
print([word + '/' + tag for (word, tag) in word_tag if tag.startswith('V')])

# Index tags by word and show which tags 'money' received.
wsj = brown.tagged_words(categories="news")
cfd = nltk.ConditionalFreqDist(wsj)
print(cfd['money'].keys())

结果如下：
<generator object bigrams at 0x00000240463ABE08>
[]
['said/VBD', 'produced/VBD', 'took/VBD', 'deserves/VBZ', 'conducted/VBN', 'charged/VBN',...]
dict_keys(['NN'])

动词

# Every distinct (word, tag) pair whose tag starts with 'V', as "word/TAG".
word_tag_fd = nltk.FreqDist(wsj)
res2 = [
    '{}/{}'.format(word, tag)
    for (word, tag) in word_tag_fd
    if tag.startswith('V')
]
print(res2)

# Switch to the whole Brown corpus and index the tags by word.
wsj = brown.tagged_words()
cfd1 = nltk.ConditionalFreqDist(wsj)
print(cfd1['money'].keys())
print(cfd1.conditions())  # every distinct word

结果如下：
['said/VBD', 'produced/VBD', 'took/VBD', 'deserves/VBZ', 'conducted/VBN', 'charged/VBN',...]
dict_keys(['NN', 'NN-HL'])
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's",...]

# Find the words that occur both as past tense (VBD) and past participle (VBN).
# The Brown tags are 'VBD'/'VBN'; 'VD'/'VN' never occur in cfd1, so the old
# filter selected nothing and wsj.index(('kicked', 'VD')) raised ValueError.
res2 = [w for w in cfd1.conditions() if 'VBD' in cfd1[w] and 'VBN' in cfd1[w]]
# print(res2)
idx1 = wsj.index(('kicked', 'VBD'))
print(idx1)

尝试找出每个名词类型中最频繁的名词

def findtags(tag_prefix, tagged_text):
    """Group the words of a tagged text by tag, for tags with a given prefix.

    Args:
        tag_prefix: only tags starting with this string are kept.
        tagged_text: iterable of (word, tag) pairs.

    Returns:
        A dict mapping each matching tag to a list of at most five words
        that were seen with that tag.
    """
    cfd = nltk.ConditionalFreqDist(
        (tag, word)
        for (word, tag) in tagged_text
        if tag.startswith(tag_prefix)
    )
    # print(cfd.conditions())  # ['NN-TL', 'NN', 'NNS', 'NN-HL', 'NN$-TL', ...]
    # NOTE(review): keys()[:5] presumably yields the first five words seen,
    # not the five most frequent ones; cfd[tag].most_common(5) would give
    # the latter. Behaviour is kept so the recorded output stays valid.
    return dict((tag, list(cfd[tag].keys())[:5]) for tag in cfd.conditions())


tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
print(tagdict)  # {'NN-TL': ['County', 'Jury', 'City', 'Committee', 'Court'], 'NN': ['investigation', 'primary', 'election', 'evidence', 'place'],...}
for tag in sorted(tagdict):
    print(tag, tagdict[tag])

结果如下：
{'NN-TL': ['County', 'Jury', 'City', 'Committee', 'Court'],
'NN': ['investigation', 'primary', 'election', 'evidence', 'place'],
'NNS': ['irregularities', 'presentments', 'thanks', 'reports', 'voters'],...}
NN ['investigation', 'primary', 'election', 'evidence', 'place']
NN$ ["ordinary's", "court's", "mayor's", "wife's", "governor's"]
NN$-HL ["Golf's", "Navy's"]
NN$-TL ["Department's", "Commissioner's", "President's", "Party's", "Mayor's"]

探索已经标注的语料库

# Words that directly follow 'often' in the 'learned' category.
brown_learned_text = brown.words(categories='learned')
res3 = sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == 'often'))
# print(res3)  # [',', '.', 'accomplished', 'analytically', 'appear', 'apt',...]

# Same question, but look at the tag of the following token instead.
brown_irnd_tagged = brown.tagged_words(categories='learned')
tags = [b[1] for (a, b) in nltk.bigrams(brown_irnd_tagged) if a[0] == 'often']
print(tags)
fd = nltk.FreqDist(tags)
fd.tabulate()

结果如下：
['AP', 'QL', 'VB', 'VBD', 'JJ', ',', 'VB', 'VBN', 'VBN', 'VBD', ',', 'VBN',
'CS', 'VBN', 'VB', 'VBN', 'VBG', 'IN', 'QL', 'RP', 'VBD', 'VBD', 'RB', 'VB',
'VBD', 'VB', 'VBD', 'CS', ',', 'CS', 'VBN', 'VB', 'RB', 'VB', 'QL', 'JJ', 'IN',
'RB', 'VBN', 'JJ', 'VBZ', 'VBN', 'VBN', 'VB', 'VBN', 'QLP', 'BEN', 'VBD', 'JJ',
'VBD', 'IN', 'IN', 'WRB', 'VB', '.', 'TO', 'VB', 'VBN', 'VBN', 'VBN', 'JJ', 'VBN',
'VBN', 'HV']
VBN VB VBD JJ IN QL , CS RB AP VBG RP VBZ QLP BEN WRB . TO HV
15 10 8 5 4 3 3 3 3 1 1 1 1 1 1 1 1 1 1

# 使用POS标记寻找三词短语

from nltk.corpus import brown
def process(sentence):
    """Print every "<verb> to <verb>" word triple in a tagged sentence.

    Args:
        sentence: sequence of (word, tag) pairs.

    A triple of consecutive tokens is printed when the first and third tags
    start with 'V' and the middle tag is exactly 'TO'.
    """
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if t1.startswith('V') and t2 == 'TO' and t3.startswith('V'):
            print(w1, w2, w3)
# Scan every tagged sentence of the Brown corpus for verb-to-verb phrases.
for tagged_sent in brown.tagged_sents():
    process(tagged_sent)

# Lower-case the news words and index the tags each word was seen with.
brown_news_tagged = brown.tagged_words(categories='news')
data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)
# print(data.conditions())  # ['the', 'fulton', 'county', 'grand']
# Show the words that were seen with more than three distinct tags.
for word in data.conditions():
    if len(data[word]) > 3:
        tags = data[word].keys()
        print(word, ' '.join(tags))  # no AT RB AT-HL AT-TL
                                     # that CS WPS DT QL WPO

    结果如下：
combined to achieve
continue to place
serve to protect
wanted to wait
allowed to place
expected to become ...

使用Python字典映射词及其属性
frequency = nltk.defaultdict(int)
print(frequency['color'])  # a missing key defaults to 0

alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
# NOTE(review): this keeps the first 1000 word types in sorted (alphabetical)
# order, not the 1000 most frequent ones; vocab.most_common(1000) would give
# the latter. Kept as is so the recorded output stays valid.
v1000 = sorted(vocab)[:1000]
mapping = nltk.defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
# Build and print the mapped text once, after the mapping is complete; doing
# this inside the loop rebuilt the whole list and printed on every iteration.
alice2 = [mapping[v] for v in alice]
print(alice2[:100])

结果如下：
0
['[', 'Alice', "'", 'UNK', 'Adventures', 'UNK', 'Wonderland', 'by', 'Lewis', 'Carroll', '1865', ']', 'CHAPTER', 'I', '.', 'Down', 'UNK', 'Rabbit', '-', 'Hole', 'Alice', 'UNK', 'beginning', 'UNK',
'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'by', 'UNK', 'UNK', 'UNK', 'UNK', 'bank', ',', 'and', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', ':', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'book', 'UNK',
'UNK', 'UNK', 'UNK', ',', 'but', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', ',', "'", 'and', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'a', 'book', ",'", 'UNK', 'Alice', "'", 'UNK', 'UNK',
'UNK', 'conversation', "?'", 'So', 'UNK', 'UNK', 'considering', 'UNK', 'UNK', 'UNK', 'UNK', '(', 'as', 'UNK', 'as', 'UNK', 'UNK', ',']

递增地更新字典
# Count how often each tag occurs in the news category.
counts = nltk.defaultdict(int)
from nltk.corpus import brown
for (word, tag) in brown.tagged_words(categories='news'):
    counts[tag] += 1
print(counts)

from operator import itemgetter
# (tag, count) pairs ordered by descending count (sort by dictionary value).
res5 = sorted(counts.items(), key=itemgetter(1), reverse=True)
# The tags alone, in that same order.
res6 = [tag_name for tag_name, _count in res5]
print(res6)  # ['NN', 'IN', 'AT', 'NP',...]

结果如下：
defaultdict(<class 'int'>, {'AT': 8893, 'NP-TL': 741, 'NN-TL': 2486, 'JJ-TL': 689, 'VBD': 2524,
'NR': 495, 'NN': 13162,...})

# Index the word list by the last two letters of each word.
last_letters = nltk.defaultdict(list)
words = nltk.corpus.words.words('en')
for word in words:
    last_letters[word[-2:]].append(word)
print(last_letters['ly'])

# Group words by their sorted letters, so anagrams share one key.
anagrams = nltk.defaultdict(list)
for word in words:
    anagrams[''.join(sorted(word))].append(word)
print(anagrams['aeilnrt'])  # every word whose sorted letters are 'aeilnrt'
# Complex keys and values: for every (tag of previous word, current word)
# pair, count the tags that the current word received.
pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
# The lookup below uses the universal tag 'DET', so the corpus has to be read
# with tagset='universal'; with the default Brown tags the key never occurs
# and the lookup only ever returned an empty defaultdict.
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
    pos[(t1, w2)][t2] += 1
print(pos[('DET', 'right')])
# Inverting a dictionary.
counts = nltk.defaultdict(int)
for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[word] += 1
# Reverse lookup by value: the words that occur exactly 32 times.
res7 = [key for (key, value) in counts.items() if value == 32]
print(res7)

# Start from a plain word -> tag dictionary. Reusing the nested defaultdict
# named pos from the previous example fails here, because its values are
# defaultdicts and cannot be used as keys of pos2.
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'sleeps': 'V', 'furiously': 'ADV'}
pos.update({'cats': 'N'})
pos2 = nltk.defaultdict(list)
for key, value in pos.items():
    pos2[value].append(key)
print(pos2['N'])
# nltk.Index builds the same tag -> words mapping from (tag, word) pairs.
pos3 = nltk.Index((value, key) for (key, value) in pos.items())
print(pos3['N'])

3.自动标注

默认标注器

from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# Default tagger: first find the single most frequent tag in the news text.
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print(nltk.FreqDist(tags).max())  # NN

raw = 'I do not like eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')  # tags every token as 'NN'
print(default_tagger.tag(tokens))  # tag() performs the tagging
print(default_tagger.evaluate(brown_tagged_sents))  # 0.13089484257215028, scored with evaluate()

正则表达式标注器

# Regular-expression tagger. The rules are fixed and hand-written; accuracy
# improves as the rule set becomes more complete.
patterns = [
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.*es$', 'VBZ'),
    (r'.*', 'NN'),
]
regexp_tagger = nltk.RegexpTagger(patterns)
res8 = regexp_tagger.tag(brown_sents[3])
res9 = regexp_tagger.evaluate(brown_tagged_sents)
# NOTE(review): these notes recorded res8 as [('``', None), ('Only', None),
# ('a', None),...] and res9 as 0.030152952642361317. With the catch-all
# (r'.*', 'NN') rule no token can be tagged None, so those values presumably
# predate that rule -- re-run to refresh them.

查询标注器

# Lookup tagger: remember the most likely tag of the 100 most frequent words.
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# most_common() orders by frequency; list(fd.keys())[:100] did not.
most_freq_words = [word for (word, _) in fd.most_common(100)]
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
res10 = baseline_tagger.evaluate(brown_tagged_sents)
# The 0.3329355371243312 recorded earlier in these notes was measured with
# the list(fd.keys())[:100] model; re-measure after this change.
# print(res10)
sent = brown.sents(categories='news')[3]  # a single sentence to try tag() on
# print(baseline_tagger.tag(sent))
# Back off to the default tagger whenever the lookup table has no entry.
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))
res11 = baseline_tagger.evaluate(brown_tagged_sents)
print(res11)
#查找标注器的性能，使用不同大小的模型
def performance(cfd, wordlist):
    """Score a lookup tagger whose model covers the given words.

    Args:
        cfd: conditional frequency distribution indexed by word; the tag
            stored for a word is cfd[word].max().
        wordlist: the words to put into the lookup model.

    Returns:
        The value returned by evaluate() on the tagged sentences of the
        Brown 'news' category, with DefaultTagger('NN') as backoff.
    """
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))
def display():
    """Plot lookup-tagger performance for model sizes 1, 2, 4, ..., 2**14."""
    import pylab

    # Order the vocabulary by descending frequency so that a prefix of length
    # `size` really is the `size` most frequent words; list(FreqDist) alone
    # does not sort by frequency.
    word_freqs = nltk.FreqDist(brown.words(categories='news')).most_common()
    words_by_freq = [word for (word, _) in word_freqs]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()


# display()

4.N-gram标注

基础的一元标注器

# Unigram tagging
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
res12 = unigram_tagger.tag(brown_sents[2007])
print(res12)  # [('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'), ...]

# Train on the first 90% of the sentences and evaluate on the remaining 10%.
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
res13 = unigram_tagger.evaluate(test_sents)
print(res13)

一般的N-gram标注器

# General N-gram tagging
bigram_tagger = nltk.BigramTagger(train_sents)
res14 = bigram_tagger.tag(brown_sents[2007])
print(res14)  # [('Various', 'JJ'), ('of', 'IN'), ('the', 'AT'),...]
# A bigram tagger can tag every word of the sentences it saw during training,
# but does badly on an unseen sentence: as soon as it meets a new word it
# cannot assign a tag.
# NOTE(review): index 1006 presumably lies inside train_sents (the first 90%
# of the sentences), so this sentence may not be unseen at all -- confirm and
# pick an index >= size if an unseen sentence is intended.
unseen_sent = brown_sents[1006]
res15 = bigram_tagger.tag(unseen_sent)
print(bigram_tagger.evaluate(test_sents))  # 0.10206319146815508
print(res15)  # [('A', 'AT'), ('capsule', 'NN'), ...

# Combine the taggers through backoff: bigram -> unigram -> default.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
res16 = t2.evaluate(test_sents)
print(res16)  # 0.8452108043456593
t3 = nltk.TrigramTagger(train_sents, backoff=t2)
res17 = t3.evaluate(test_sents)
print(res17)  # 0.843317053722715
# Storing the tagger
from pickle import dump
# -1 selects the highest pickle protocol; the with-block closes the file even
# if dump() raises.
with open('t2.pkl', 'wb') as output:
    dump(t2, output, -1)

from pickle import load
# NOTE: only unpickle files you wrote yourself; pickle can execute arbitrary
# code when loading untrusted data.
with open('t2.pkl', 'rb') as input_:
    tagger = load(input_)

text = """The board's actions shows what free enterprise is up against in our complex maze of regulatory laws."""
tokens = text.split()
res18 = tagger.tag(tokens)
print(res18)  # [('The', 'AT'), ("board's", 'NN$'), ('actions', 'NNS'),]
# Performance limitations: how much of the data sits in ambiguous contexts.
# ConditionalFreqDist takes (condition, sample) pairs. The condition is the
# (previous-previous tag, previous tag, current word) triple and the sample
# is the current tag; a flat 4-tuple cannot be unpacked into such a pair.
cfd = nltk.ConditionalFreqDist(
    ((x[1], y[1], z[0]), z[1])
    for sent in brown_tagged_sents
    for x, y, z in nltk.trigrams(sent)
)
# A context is ambiguous when more than one tag was observed for it.
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
res19 = sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()
print(res19)

组合标注器

# Backoff chain: the bigram tagger falls back to the unigram tagger, which
# falls back to tagging everything 'NN'.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
res16 = t2.evaluate(test_sents)
print(res16)  # 0.8452108043456593

# Put a trigram tagger in front of the same chain.
t3 = nltk.TrigramTagger(train_sents, backoff=t2)
res17 = t3.evaluate(test_sents)
print(res17)  # 0.843317053722715

跨句子边界标注

# Tagging across sentence boundaries: train on a list of tagged sentences.
# The first word of a sentence has no preceding n words; the remedy is to
# train the tagger on the already-tagged tagged_sents.

brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# 90% / 10% train/test split.
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]

t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
res16 = t2.evaluate(test_sents)
print(res16)  # 0.8452108043456593

5.基于转换的标注：Brill标注器

# Better than all of the taggers above. The idea: start with broad brush
# strokes, then fix the details, refining the result a little at a time.
# It not only needs little memory but also takes context into account, and it
# corrects errors as the remaining problem shrinks instead of staying fixed.
from nltk.tag import brill
print(brill.nltkdemo18plus())
# NOTE(review): the result of this call is discarded -- confirm whether it
# was meant to be printed like the line above.
brill.nltkdemo18()