10.15作业

1.英文

# Load the English source text.
with open('steve.txt', 'r', encoding='utf-8') as f:
    novel = f.read()

# Normalise the text: every separator character becomes a single blank,
# then the whole text is lowercased so that counting ignores case.
sep = " .!@#%&*;:',.?/_“’”"
novel = novel.translate(str.maketrans(sep, ' ' * len(sep))).lower()

# Split on runs of whitespace and show the tokens with their total count.
strnovel = novel.split()
print(strnovel, len(strnovel))

# Reduce the token list to its distinct words and drop the stop words
# (articles, prepositions, stray digits) that should not be ranked.
noMean = {
    'is', 'and', 'a', 'this', 'the', 'in', 'at',
    'on', 'to', 's', 'his', '3', '1983',
}
strset = set(strnovel).difference(noMean)
print(strset, len(strset))

# Count how often each remaining word occurs in the token list.
# One Counter pass over the tokens replaces a full list.count() scan per
# distinct word, which rescanned the whole text once for every word.
from collections import Counter

token_counts = Counter(strnovel)
# Iterate strset (not the Counter) so stop words stay excluded and the
# resulting dict keeps the same key order as before.
strdict = {word: token_counts[word] for word in strset}
print(strdict, len(strdict))
wordlist = list(strdict.items())

# Rank the (word, count) pairs by count, most frequent first.
wordlist.sort(key=lambda x: x[1], reverse=True)
print(wordlist)

# Output the top 20 entries. Slicing (instead of indexing 0..19) also
# works when the text has fewer than 20 distinct words; indexing raised
# IndexError in that case.
for entry in wordlist[:20]:
    print(entry)

2.中文小说

#-*- coding:utf-8 -*-
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba



# Load the novel text.
with open('doupo.txt', 'r', encoding='utf-8') as f:
    doupo = f.read()

# Cleaning: map every listed ASCII punctuation character to a blank in a
# single translate pass.
sep = " ,.?;:'!*#-_"
doupo = doupo.translate(str.maketrans(sep, ' ' * len(sep)))
# Word segmentation: hand the cleaned text to jieba.
wordList = jieba.cut(doupo)
print(type(wordList))
# Frequency analysis: tally every token whose length is not exactly one
# character; single-character tokens are ignored.
data = {}
for token in wordList:
    if len(token) != 1:
        data[token] = data.get(token, 0) + 1

# Rank the (word, count) pairs by count, most frequent first, and print the
# top 20. Slicing keeps this safe when fewer than 20 distinct words were
# counted; indexing result[0..19] raised IndexError in that case.
result = sorted(data.items(), key=lambda x: x[1], reverse=True)
for entry in result[:20]:
    print(entry)

# Word -> occurrence count, in ranked order. The counts themselves are handed
# to the word cloud: a string that lists every distinct word once would give
# all words the same weight and discard the frequency analysis above.
frequencies = dict(result)
wc = WordCloud(background_color="black",  # background colour of the image
               # mask=<image array>,  # optional background/shape image
               max_words=2000,  # maximum number of words drawn
               # stopwords=<set of words>,  # optional stop words
               # Raw string: in a normal literal the "\N" in this path starts
               # a named-Unicode escape and is a syntax error. The path is
               # typed without any invisible formatting character before the
               # drive letter.
               font_path=r"C:\Windows\Fonts\NotoSansHans-Black_0.otf",
               # A font with Chinese glyphs is needed for the words to render;
               # the library's default font (DroidSansMono) has none.
               max_font_size=40,  # upper bound on the font size
               random_state=30,  # RNG seed, keeps layout/colours reproducible
               )
mywc = wc.generate_from_frequencies(frequencies)  # build the word cloud

# Display the word cloud.
plt.imshow(mywc)
plt.axis("off")
plt.show()
wc.to_file('myword.jpg')  # save the image to disk