hamlet 词频统计与 jieba 分词词云

 1 def getTaxt():
 2     txt=open('hamlet.txt')
 3     txt = txt.lower()
 4     for ch in '!"#$%&()*+,-./:;<=>?@[\]^_‘{|}~':
 5         txt = txt.replace(ch, " ")   #将文本中特殊字符替换为空格
 6     return txt
 7 
 8 hamletTxt = getText()
 9 words  = hamletTxt.split()
10 counts = {}
11 for word in words:            
12     counts[word] = counts.get(word,0) + 1
13 items = list(counts.items())
14 items.sort(key=lambda x:x[1], reverse=True) 
15 for i in range(10):
16     word, count = items[i]
17     # print ("{0:<10}{1:>5}".format(word, count))  输出出现最多的10个单词和其出现次数
18     print (word,count)  #输出出现最多的10个单词

 词云表示

 1 import jieba
 2 import wordcloud
 3 import matplotlib.pyplot as plt
 4 f = open("D:\360安全浏览器下载\hamlet.txt", "r", encoding="utf-8")#文件路经每个人都不一样,此程序可以在jupyter上运行
 5  
 6 t = f.read()
 7 f.close()
 8 ls = jieba.lcut(t)
 9  
10 txt = " ".join(ls)
11 w = wordcloud.WordCloud( 
12     width = 1000, height = 700,
13     background_color = "pink",
14     font_path = "msyh.ttc"    
15     )
16 myword=w.generate(txt)
17 plt.imshow(myword)
18 plt.axis("off")
19 plt.show()
20 w.to_file("g.png")

原文地址:https://www.cnblogs.com/ghh0/p/12642264.html