Python learning: added the jieba library and a wordcloud script to generate a word cloud

Version:

  Added the jieba library and a wordcloud script to generate a word cloud
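
The core pipeline is short: read the text, segment it with jieba, join the tokens with spaces, and hand the result to WordCloud. A minimal sketch, assuming a UTF-8 file named sample.txt and a local CJK font simhei.ttf (both placeholders, not the files used by the full script below):

import jieba
from wordcloud import WordCloud

with open('sample.txt', encoding='utf-8') as f:        # placeholder input file
    tokens = jieba.lcut(f.read())                      # precise-mode segmentation
text = ' '.join(tokens)                                # WordCloud expects a space-separated string
WordCloud(font_path='simhei.ttf',                      # a CJK font avoids garbled glyphs
          background_color='white').generate(text).to_file('sample_cloud.png')

The full script follows.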

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
__author__ = '王益夫'
__mtime__ = '2019/12/20'
'''
'''
Change log:
V 1.0: analyse the text fetched by the earlier "get" script with the jieba library and a word cloud
'''
import jieba
from wordcloud import WordCloud
from os import path
import re
import matplotlib.pyplot as plt
#from scipy.misc import imread   # scipy.misc.imread was removed in newer SciPy releases
import imageio                   # imageio.imread is used instead to load the mask image

file_path = path.join(path.dirname(path.abspath(__file__)), 'temp')
file_name1 = r'新闻联播.txt'
file_name2 = r'StopWords.txt'
file_name3 = r'AddWords.txt'

TextPath = file_path + '/' + file_name1
StopWordsPath = file_path + '/' + file_name2
AddWordsPath = file_path + '/' + file_name3
print(AddWordsPath)


def jiebaclearText(text):
    mywordslist = []
    seg_list = jieba.cut(text, cut_all=False)
    #seg_list = jieba.cut(TestStr, cut_all=True)    Full mode: builds every word that can be formed from the text; very fast, but it cannot resolve ambiguity and the segmentation is less accurate.
    #seg_list = jieba.cut(TestStr, cut_all=False)   Precise (default) mode: splits the sentence as accurately as possible; best suited to text analysis and the usual choice for Chinese segmentation.
    #seg_list = jieba.cut_for_search(TestStr)       Search-engine mode: based on precise mode, re-segments long words to improve recall; intended for search-engine indexing.
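    # A quick illustration of the three modes on the classic jieba sample sentence
    # (hypothetical output; the exact result depends on the jieba version and dictionary):
    #   jieba.lcut("我来到北京清华大学", cut_all=True)    # e.g. 我/来到/北京/清华/清华大学/华大/大学
    #   jieba.lcut("我来到北京清华大学", cut_all=False)   # e.g. 我/来到/北京/清华大学
    #   jieba.lcut_for_search("我来到北京清华大学")       # precise mode plus re-segmentation of long words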
    liststr = "/".join(seg_list)
    with open(StopWordsPath, encoding='utf-8', errors='ignore') as f_stop:
        f_stop_text = f_stop.read()

    f_stop_seg_list = f_stop_text.split('\n')
    for myword in liststr.split('/'):
        if not (myword.strip() in f_stop_seg_list) and len(myword.strip()) > 1:
            mywordslist.append(myword)
    return ' '.join(mywordslist)
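
# Hedged usage note (illustrative input, not taken from the data files): calling
#   jiebaclearText("2019年12月20日《新闻联播》主要内容……")
# returns one space-separated string of tokens with stop words and single-character
# tokens removed, which is the form WordCloud.generate() expects.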

def addWordsRulls(text):
    addwords_list = set()
    try:
        results = re.findall('《[^》]+》', text)   # titles wrapped in 《》 book-title marks
        for result in results:
            addwords_list.add(result)
            #jieba.add_word(result)
        return True
    except Exception:
        addwords_list.add('ERROR: add-word regex parsing failed, no keywords were extracted!')
        return False
    finally:
        with open(AddWordsPath, 'a+', encoding='utf-8', errors='ignore') as file_add:
            for line in list(addwords_list):
                file_add.write(line + '\n')

def StopWordsRulls(text):
    Stopwords_list = set()
    try:
        results = re.findall(r'\d{4}年\d{1,2}月\d{1,2}日', text)   # match dates such as 2019年12月20日
        for result in results:
            print(result)
            Stopwords_list.add(result)
            #jieba.add_word(result)
        return True
    except Exception:
        Stopwords_list.add('ERROR: stop-word regex parsing failed, no keywords were extracted!')
        return False
    finally:
        with open(StopWordsPath, 'a+', encoding='utf-8', errors='ignore') as file_Stop:
            for line in list(Stopwords_list):
                file_Stop.write(line + '\n')

def main():
    with open(TextPath, encoding='utf-8', errors='ignore') as file_Text:
        text = file_Text.read()
    # for key in analyse.extract_tags(text, 50, withWeight=False):
    #     print(key)

    if addWordsRulls(text) and StopWordsRulls(text):
        with open(AddWordsPath, 'r', encoding='utf-8', errors='ignore') as file_read:
            context = set(file_read.read().splitlines())   # one custom word per line
            for line in context:
                if line.strip():
                    jieba.add_word(line.strip())

    text_text = jiebaclearText(text)

    color_mask = imageio.imread(file_path + "/template.jpeg")
    cloud = WordCloud(
        # set the font; without a CJK font the Chinese characters render as garbled boxes
        font_path="./temp/HYQiHei-25J.ttf",
        # font_path=path.join(d,'simsun.ttc'),
        # background colour
        background_color='white',
        # mask image defining the shape of the cloud
        mask=color_mask,
        # maximum number of words to show
        max_words=200,
        # maximum font size
        max_font_size=40
    )
#    wordcloud = WordCloud(background_color="white", width=1000, height=860, margin=2).generate(text_text)
    word_cloud = cloud.generate(text_text)  # generate the word cloud
    word_cloud.to_file("test.jpg")  # save the image
    # display the word cloud
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()

if __name__ == '__main__':
    main()

  

Original post: https://www.cnblogs.com/wyf-349/p/12124193.html