# Word-frequency statistics for articles (counts proper nouns found in .docx files).

import os
import re

from nltk import ne_chunk, pos_tag, word_tokenize
import nltk
from docx import Document
import langid
import pandas as pd


def readWord(rootdir=r'C:\Users\Administrator\Desktop\一季度'):
    """Read every .docx file under *rootdir* and return the concatenated
    text of all English-language paragraphs, one paragraph per line.

    Args:
        rootdir: Directory containing the .docx files to scan.
            Defaults to the original hard-coded path for backward
            compatibility.

    Returns:
        A single string with each English paragraph followed by a newline.
    """
    # NOTE: the original used a plain string 'C:\Users\...'; in Python 3 the
    # \U sequence is an invalid escape, so a raw string is required.
    text = ""
    for entry in os.listdir(rootdir):  # list every file in the folder
        path = os.path.join(rootdir, entry)
        print(path)
        document = Document(path)
        for paragraph in document.paragraphs:
            # langid.classify returns (language_code, score); keep only
            # paragraphs detected as English.
            if langid.classify(paragraph.text)[0] == 'en':
                text += paragraph.text + "\n"
    return text


def get_entities():
    """Count proper-noun (NNP/NNPS) word frequencies in the text returned
    by readWord() and write the result to c4i.csv.

    The text is tokenized, POS-tagged and NE-chunked with NLTK.  Tokens
    consisting of (or containing) extraction garbage are skipped.  The
    resulting ``{word: count}`` mapping is written to ``c4i.csv`` with
    columns ``word`` and ``num``.
    """
    counts = {}
    sentence = readWord()

    # Characters that mark a token as OCR/extraction garbage, and whole
    # tokens to discard — reconstructed from the commented-out exclusion
    # list in the original source.  NOTE(review): the original live code
    # counted tokens that *matched* this pattern, contradicting its own
    # commented-out version; the exclusion semantics are restored here.
    junk_chars = re.compile(r"[’”—\[\]…/@|]")
    junk_tokens = {"", "s", "P", "II", "R", "A"}

    tagged_sentence = ne_chunk(pos_tag(word_tokenize(sentence)))
    for tagged in tagged_sentence:
        if len(tagged) == 2:
            # Plain (word, POS) leaf outside any named-entity subtree.
            word, tag = tagged
        elif len(tagged[0]) == 2:
            # Named-entity subtree: take its first (word, POS) leaf,
            # mirroring the original's tagged[0] access.
            word, tag = tagged[0]
        else:
            continue
        if (tag in ("NNP", "NNPS")
                and word not in junk_tokens
                and not junk_chars.search(str(word))):
            counts[word] = counts.get(word, 0) + 1

    # Convert the mapping to a list of records so pandas can build a
    # DataFrame (row per word) and export it to CSV.  utf_8_sig adds a BOM
    # so Excel opens the file with the right encoding.
    records = [{"word": word, "num": num} for word, num in counts.items()]
    frame = pd.DataFrame(records)
    frame.to_csv('c4i.csv', encoding='utf_8_sig')


if __name__ == '__main__':
    get_entities()
    # readWord()

# Dependencies used:
#   python-docx==0.8.11
# Original article: https://www.cnblogs.com/lxz123/p/15137721.html