python自然语言处理——2.2 条件频率分布

微信公众号:数据运营人
本系列为博主的读书学习笔记,如需转载请注明出处。

第二章 获取文本语料和词汇资源

本节目录:2.2 条件频率分布 —— 条件和事件;按文体计数词汇;绘制分布图和分布表;使用双连词生成随机文本

2.2 条件频率分布

条件和事件
# A plain text is just a sequence of words (events) ...
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said']
# ... while a conditional frequency distribution is fed (condition, event)
# pairs: here every word is tagged with the genre it was observed in.
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County')]
按文体计数词汇
 1from nltk.corpus import brown                                      
2cfd = nltk.ConditionalFreqDist(                                    
3         (genre,word)                                               
4     for genre in brown.categories()                                
5     for word in brown.words(categories = genre) )                                                                  
6print(len(cfd))                                                                                                         
7genre_word= [                                                      
8     (genre,word)                                                   
9     for genre in ['news','romance']                                
10     for word in brown.words(categories = genre)]                                                                  
11print(len(genre_word))                                             
12print(genre_word[:4])                                              
13print(genre_word[-4:])                                             
14cfd = nltk.ConditionalFreqDist(genre_word)                         
15print(cfd)                                                         
16print(cfd.conditions())                                            
17print(cfd["news"])                                                 
18print(cfd["romance"])                                              
19# print(list(cfd["romance"]))                                        
20print(cfd["romance"]["could"])  

返回结果:

绘制分布图和分布表
 1from nltk.corpus import inaugural                                                 
2cfd = nltk.ConditionalFreqDist(                                                   
3        (target,fileid[:4])                                                       
4    for fileid in inaugural.fileids()                                             
5    for w  in  inaugural.words()                                                  
6    for target in ["america","citizen"]                                           
7    if w.lower().startswith(target))                                              
8print(cfd.conditions())                                                           
9print(cfd["citizen"])                                                             
10print(list(cfd["citizen"]))                                                       
11# 如果输出结果为<FreqDist with 56 samples and 17976 outcomes>,设置成列表                      
12print(list(cfd["america"]))                                                       
13print(cfd["citizen"]["america"])                                                  
14
15from nltk.corpus import udhr                                                      
16languages = ["Chickasaw","English","German_Deutsch"]                              
17cfd = nltk.ConditionalFreqDist(                                                   
18        (lang,len(word))                                                          
19         for lang in languages                                                    
20         for word in udhr.words(lang + '-Latin1'))                                
21print(cfd.tabulate(conditions = ["English","Chickasaw"],                          
22                   samples = range(10),cumulative = True))                        

返回结果:

使用双连词生成随机文本
 1sent = ['In','the','beginning','God','created','the','heaven']                         
2print(list(nltk.bigrams(sent)))                                                        
3
def generate_model(cfdist, word, num=15):
    """Print a chain of up to *num* words, each the likeliest successor of the last.

    Args:
        cfdist: Mapping from a word to a distribution of the words seen
            after it. Each value must provide ``max()`` returning its most
            frequent sample (e.g. an ``nltk.ConditionalFreqDist`` built
            from bigrams).
        word: Seed word; it is the first word printed.
        num: Maximum number of words to print.

    Returns:
        None. The words are written to standard output, one per line.
    """
    for _ in range(num):
        print(word)
        followers = cfdist[word]
        # A word that was never followed by anything has an empty
        # distribution, for which max() is undefined; end the chain there
        # instead of raising.
        if not followers:
            break
        word = followers.max()
import nltk

# Build a bigram model of the King James Genesis: condition = a word,
# event = the word that follows it.
text = nltk.corpus.genesis.words("english-kjv.txt")
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
print(cfd["living"])
# generate_model prints the words itself and returns nothing, so it is
# called directly; wrapping it in print() would emit a trailing "None".
generate_model(cfd, "living")

返回结果:

原文地址:https://www.cnblogs.com/ly803744/p/10082886.html