短文本情感分类(一)

# coding:utf-8
# 将文本分词处理
import io

import jieba

stoplist = {}.fromkeys([ line.strip() for line in open('/Test/orgindata/stopwords.txt') ])

input = open('/Test/orgindata/corpus.txt','r')
output = open('/Test/process2/corpus-seg.txt','w+')

line = input.readline()
index = 0
text = ''
while line!=None and len(line) > 4:
    #去除头部和尾部的<content> </content>
    line = line[9:-11]

    # segments = thu1.cut(line, text=True)
    segments = jieba.cut(line)

    # segments = segments.split(' ')
    segments = [word for word in list(segments) if word not in stoplist]

    result = ''
    for segment in segments:
        if len(segment)>1:
            result  += segment + ' '
    line = input.readline()
    if len(result) > 4:
        text += result
        index += 1
        if index%100 == 0:
            output.write(text.encode('utf-8') + '
')
            text = ''
            print('line '+str(index))

print '处理完成'

  

原文地址:https://www.cnblogs.com/mengxingxinqing/p/8026990.html