【python】jieba分词,去停用词,自定义字典

使用jieba分词,去停用词,添加自定义字典。

#encoding=utf-8
import jieba
filename = "gp.txt"
stopwords_file = "stopwords.txt"
jieba.load_userdict("dict.txt")
stop_f = open(stopwords_file,"r",encoding='utf-8')
stop_words = list()
for line in stop_f.readlines():
    line = line.strip()
    if not len(line):
        continue

    stop_words.append(line)
stop_f.close

#print(len(stop_words))

f = open(filename,"r",encoding='utf-8')
result = list()
for line in f.readlines():
    line = line.strip()
    if not len(line):
        continue
    outstr = '' 
    seg_list = jieba.cut(line,cut_all=False) 
    for word in seg_list:  
        if word not in stop_words:  
            if word != '	':  
                outstr += word 
                outstr += " "  
   # seg_list = " ".join(seg_list)
    result.append(outstr.strip())
f.close

with open("gp2.txt","w",encoding='utf-8') as fw:
    for sentence in result:
        sentence.encode('utf-8')
        data=sentence.strip()  
        if len(data)!=0:  
            fw.write(data)
            fw.write("
") 


print ("end")
原文地址:https://www.cnblogs.com/helenlee01/p/12617477.html