Team Project Sprint, Day 3: Data Cleaning (Part 1)

With the data already pulled from the source, the next step is cleaning it.

For Chinese text classification, the raw text first has to be segmented into words, and stop words have to be removed so they do not skew feature extraction. Segmentation splits the running text into individual words; those words are what the later feature extraction step works on.
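
Before the full script, here is a minimal sketch of that idea on a single sentence. The sample sentence and the tiny three-word stop list are made up for illustration only; the real stop-word list is loaded from a file in the script below.

# Minimal sketch (illustrative only): segment one sentence with jieba,
# then drop stop words and whitespace tokens.
import jieba

stop_words = {'的', '了', '是'}   # tiny made-up stop-word set for the demo

text = '数据已经取出，然后是对数据的清洗'
words = [w for w in jieba.cut(text, cut_all=False)
         if w.strip() and w not in stop_words]
print(' '.join(words))            # space-separated tokens ready for feature extraction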

# encoding=utf-8
# Walk the article directory and process every file with ProsessofWords.
import jieba
import os


def EnumPathFiles(path, callback, stop_words_list):
    # Recursively walk `path` and invoke `callback` on every file found.
    if not os.path.isdir(path):
        print('Error:"', path, '" is not a directory or does not exist.')
        return
    list_dirs = os.walk(path)

    for root, dirs, files in list_dirs:
        for d in dirs:
            print(d)
            EnumPathFiles(os.path.join(root, d), callback, stop_words_list)
        for f in files:
            callback(root, f, stop_words_list)


def ProsessofWords(textpath, stop_words_list):
    # Read the file, segment it with jieba, drop stop words,
    # and write the space-separated result back to the same file.
    f = open(textpath, 'r', encoding='utf-8')
    text = f.read()
    f.close()
    outstr = ''
    seg_list = jieba.cut(text, cut_all=False)
    for word in seg_list:
        if word not in stop_words_list:
            if word != ' ':
                outstr += word
                outstr += " "
    f = open(textpath, 'w+', encoding='utf-8')
    f.write(outstr)
    f.close()


def callback1(path, filename, stop_words_list):
    textpath = os.path.join(path, filename)
    print(textpath)
    ProsessofWords(textpath, stop_words_list)


if __name__ == '__main__':
    # Load the stop-word list (one word per line), then clean every article file.
    stopwords_file = "../stopword/stopword.txt"
    stop_f = open(stopwords_file, "r", encoding='utf-8')
    stop_words = list()
    for line in stop_f.readlines():
        line = line.strip()
        if not len(line):
            continue
        stop_words.append(line)
    stop_f.close()
    print(len(stop_words))

    EnumPathFiles(r'../article', callback1, stop_words)
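
The script above stops at writing the cleaned, space-separated text back to disk. As a rough sketch of where that output can go next, the snippet below feeds such files into scikit-learn's TfidfVectorizer; scikit-learn and the example file names are assumptions for illustration, not part of the original post.

# Hedged sketch: turning the cleaned files into TF-IDF features.
# scikit-learn and the file paths are assumptions, not from the original script.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = []
for name in ['../article/sample1.txt', '../article/sample2.txt']:   # hypothetical files
    with open(name, encoding='utf-8') as f:
        docs.append(f.read())

# The files are already tokenized (words joined by spaces), so splitting
# on whitespace is enough; no further analysis is needed.
vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None)
X = vectorizer.fit_transform(docs)
print(X.shape)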
Original post: https://www.cnblogs.com/huangmouren233/p/14759878.html