中文词频统计

from collections import Counter

import jieba
# Characters removed before segmentation: ASCII digits, common Chinese and
# ASCII punctuation, and the newline embedded in this literal.
# NOTE(review): the literal contains the character '一'; if a dash was
# intended instead, this needs correcting -- confirm against the source text.
STRIP_CHARS = '''1234567890一!!“”,。?、;’"',.、:()()
‘’'''

# One translation table so the whole strip happens in a single pass.
_STRIP_TABLE = str.maketrans(STRIP_CHARS, " " * len(STRIP_CHARS))

# Tokens that are never reported (particles, pronouns, whitespace).
# NOTE(review): the original list consisted mostly of empty strings, which
# may be stop words lost in transcription; only the distinct values are kept.
# '\u3000' is the ideographic space. The original spelled it 'u3000', which
# could never match because digits are stripped before segmentation.
EXCLUDE = frozenset({
    '', ' ', '\u3000',
    '他们', '可是', '一个', '一点', '没有',
})

# Number of top entries printed by the script.
TOP_N = 19


def strip_noise(text):
    """Return *text* with every character of STRIP_CHARS replaced by a space."""
    return text.translate(_STRIP_TABLE)


def rank_words(tokens, limit=TOP_N):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Words that occur only once, and words listed in EXCLUDE, are dropped.
    Ties keep the order in which the words first appear in *tokens*.
    Fewer than *limit* pairs are returned when not enough words qualify.
    """
    counts = Counter(tokens)
    # A word seen only once is generally not meaningful, so skip it.
    kept = {
        word: total
        for word, total in counts.items()
        if total > 1 and word not in EXCLUDE
    }
    ranked = sorted(kept.items(), key=lambda item: item[1], reverse=True)
    # Slicing (rather than indexing 0..limit-1) tolerates short results.
    return ranked[:limit]


def main():
    """Read new.txt, segment it with jieba and print the top words."""
    with open('new.txt', 'r', encoding='utf-8') as source:
        text = source.read()

    tokens = jieba.lcut(strip_noise(text))
    for entry in rank_words(tokens):
        print(entry)


if __name__ == "__main__":
    main()

运行结果:

('工会', 17)
('日', 16)
('月', 12)
('清明节', 11)
('经费', 10)
('不准', 8)
('元', 7)
('将', 7)
('上调', 6)
('节日', 6)
('假期', 6)
('规定', 5)
('基层', 5)
('号', 5)
('汽油', 5)
('每升', 4)
('福利', 4)
('标准', 4)
('发放', 4)

原文地址:https://www.cnblogs.com/1103a/p/8666407.html