综合练习:词频统计

f = open("peng.txt", "r", encoding='utf-8')
song = f.read()
f.close()

sep = ''',.?—!"'''

exclude = {'the', 'and', 'i', 'in', "i'm", 'a', 'of', 'an', 'on', 'to', 'with'}

for c in sep:
    song = song.replace(c, ' ')

swl = song.lower().split()

swd = {}

sws = set(swl) - exclude

for w in sws:
    swd[w] = swl.count(w)

fl = list(swd.items())

fl.sort(key=lambda x: x[1], reverse=True)

for i in fl:
    print(i)

f = open("result.txt", "w")
for i in range(20):
    f.write(fl[i][0] + "  " + str(fl[i][1]) + "
")
f.close()

  

import jieba

f = open('weicheng.txt', 'r', encoding='utf-8')
text = f.read()
f.close()

p = ''',。‘’“”:;()!?、 '''
a = {
    '的', '
', 'u3000',
    '曰', '之', '不', '人', '一', '大', '马', '来', '有', '于', '下', '此',
}
for i in p:
    text = text.replace(i, '')
print(list(jieba.cut(text)))
t = list(jieba.lcut(text))
print(t)
count = {}
wl = list(set(t) - a)
print(wl)

for i in range(0, len(wl)):
    count[wl[i]] = text.count(str(wl[i]))

cl = list(count.items())
cl.sort(key=lambda x: x[1], reverse=True)
print(cl)

f = open('wcCount.txt', 'a')
for i in range(20):
    f.write(cl[i][0] + ':' + str(cl[i][1]) + '
')
f.close()

  

原文地址:https://www.cnblogs.com/phoenlix/p/8666515.html