综合练习:词频统计


一、英文词频统计


from collections import Counter

# Punctuation that is blanked out before the text is split into words.
EN_SEPARATORS = ''',.?—!"'''

# Words that are left out of the statistics.
EN_STOP_WORDS = frozenset({
    'the', 'and', 'i', 'in', "i'm", 'a', 'of', 'an', 'on', 'to', 'with',
})

# How many of the most frequent words are written to the result file.
EN_TOP_N = 20


def english_word_frequencies(text, separators=EN_SEPARATORS,
                             exclude=EN_STOP_WORDS):
    """Return ``(word, count)`` pairs for *text*, most frequent first.

    Every character in *separators* is replaced by a space, the text is
    lower-cased and split on whitespace, and words in *exclude* are dropped.
    Words with equal counts keep the order of their first appearance.
    """
    blank_out = str.maketrans(dict.fromkeys(separators, ' '))
    words = text.translate(blank_out).lower().split()
    # One pass over the words instead of a list.count() scan per distinct word.
    counts = Counter(word for word in words if word not in exclude)
    return counts.most_common()


def top_word_lines(frequencies, limit=EN_TOP_N):
    """Format at most *limit* leading pairs, one "word  count" record per line.

    Slicing (rather than indexing 0..limit-1) means a text with fewer than
    *limit* distinct words yields a shorter list instead of an IndexError.
    """
    return [
        word + "  " + str(count) + "\n"
        for word, count in frequencies[:limit]
    ]


def run_english_count(source="text.txt", target="result.txt"):
    """Count the words in *source*, print every pair, save the top ones.

    Each ``(word, count)`` tuple is printed to stdout; the leading
    ``EN_TOP_N`` pairs are written to *target*, overwriting it.
    """
    # Context managers close the files even if read()/write() raises.
    with open(source, "r") as src:
        song = src.read()

    frequencies = english_word_frequencies(song)
    for pair in frequencies:
        print(pair)

    with open(target, "w") as out:
        out.writelines(top_word_lines(frequencies))


if __name__ == "__main__":
    run_english_count()

二、中文词频统计

 1 import jieba
 2  
 3 f = open('xiyouji.txt','r', encoding='utf-8')
 4 text = f.read()
 5 f.close()
 6  
 7  
 8 import jieba
 9   
10 #打开文件
11 file = open("zgsjtl.txt",'r',encoding="utf-8")
12 notes = file.read();
13 file.close();
14   
15 #替换标点符号
16 sep = ''':。,?!;∶ ...“”'''
17 for i in sep:
18     notes = notes.replace(i,' ');
19   
20 notes_list = list(jieba.cut(notes));
21   
22   
23  
24 exclude =[' ','
','','','','','','','','','','','','','']
25   
26 notes_dict={}
27 for w in notes_list:
28     notes_dict[w] = notes_dict.get(w,0)+1
29   
30 for w in exclude:
31     del (notes_dict[w]);
32   
33 for w in notes_dict:
34     print(w,notes_dict[w])
35   
36   
37  
38 dictList = list(notes_dict.items())
39 dictList.sort(key=lambda x:x[1],reverse=True);
40 print(dictList)
41   
42  
43 for i in range(20):
44     print(dictList[i])
45  
46 outfile = open("top20.txt","a")
47 for i in range(20):
48     outfile.write(dictList[i][0]+" "+str(dictList[i][1])+"
")
49 outfile.close();
原文地址:https://www.cnblogs.com/Fanchuguang/p/8666654.html