A simplified Python text-mining example

import xlrd
import jieba
import sys  
import importlib
import os          # built-in module for file and directory operations (e.g. os.listdir)
import pickle      # pickle for object persistence (serialization)
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pylab import mpl  
from sklearn.naive_bayes import MultinomialNB  # multinomial Naive Bayes classifier
from sklearn.svm import SVC                    # SVC is used directly in __main__ below
from sklearn import svm

from sklearn import metrics 
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
importlib.reload(sys)   # Python 2 holdover (paired with sys.setdefaultencoding); has no effect in Python 3


# Convert the text content and the labels into vector form
trainContentdatasave = []   # holds the segmented words of all training and test documents
testContentdatasave = []

trainContentdata = []
testContentdata = []
trainlabeldata = []
testlabeldata = []
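
# Hedged sketch (not part of the original script): jieba is imported above and the
# *Contentdatasave lists are declared for segmented text, but the script never fills them.
# One way to segment each document and join the tokens with spaces so TfidfVectorizer
# can tokenize them again; the helper name segmentContentdata is an assumption.
# In __main__ the vectorizer could then be fed trainContentdatasave / testContentdatasave
# instead of the raw trainContentdata / testContentdata.
def segmentContentdata():
    for text in trainContentdata:
        trainContentdatasave.append(" ".join(jieba.cut(str(text))))
    for text in testContentdata:
        testContentdatasave.append(" ".join(jieba.cut(str(text))))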

# Load the training and test text (description) data
def importTrainContentdata():
    file = '20180716_train.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        trainContentdata.append(ws.cell(r, 0).value)

def importTestContentdata():
    file = '20180716_test.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        testContentdata.append(ws.cell(r, 0).value)   

# Load the training and test label data
def importTrainlabeldata():
    file = '20180716_train_label.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        trainlabeldata.append(ws.cell(r, 0).value)
        
def importTestlabeldata():
    file = '20180716_test_label.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        testlabeldata.append(ws.cell(r, 0).value)
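
# Hedged sketch (not defined in this trimmed-down version, but referenced near the end of
# the script as metrics_result(a, b)): evaluation built on the already imported
# sklearn.metrics module; the 'weighted' averaging choice is an assumption.
def metrics_result(actual, predict):
    print('precision: %.3f' % metrics.precision_score(actual, predict, average='weighted'))
    print('recall:    %.3f' % metrics.recall_score(actual, predict, average='weighted'))
    print('f1-score:  %.3f' % metrics.f1_score(actual, predict, average='weighted'))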


if __name__=="__main__": 
    
    importTrainContentdata()
    importTestContentdata()
    importTrainlabeldata()
    importTestlabeldata()
    
    '''Naive Bayes
    clf = MultinomialNB(alpha=0.052).fit(train_set.tdm, train_set.label)  
    #clf = svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr')
    clf.fit(train_set.tdm, train_set.label)  
    predicted=clf.predict(test_set.tdm)
    
    Logistic regression
    tv = TfidfVectorizer()
    train_data = tv.fit_transform(X_train)
    test_data = tv.transform(X_test)
    
    lr = LogisticRegression(C=3)
    lr.fit(train_set.tdm, train_set.label)
    predicted=lr.predict(test_set.tdm)
    print(lr.score(test_set.tdm, test_set.label))
    #print(test_set.tdm)
    
    #SVM
    clf = SVC(C=1500)
    clf.fit(train_set.tdm, train_set.label)
    predicted=clf.predict(test_set.tdm)
    print(clf.score(test_set.tdm, test_set.label))
    '''
    
    tv = TfidfVectorizer()
    train_data = tv.fit_transform(trainContentdata)
    test_data = tv.transform(testContentdata)

    clf = SVC(C=1500)
    clf.fit(train_data, trainlabeldata)
    predicted = clf.predict(test_data)
    print(clf.score(test_data, testlabeldata))
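
    # Hedged sketch: the Naive Bayes alternative from the commented block above, wired to
    # the actual TF-IDF features of this script (alpha=0.052 is taken from that block).
    # Kept commented out, like the other alternatives; uncomment to compare with the SVC.
    #nb_clf = MultinomialNB(alpha=0.052)
    #nb_clf.fit(train_data, trainlabeldata)
    #print(nb_clf.score(test_data, testlabeldata))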
    
    
    
    a = []
    b = []
    for i in range(len(predicted)):
        b.append(int(float(predicted[i])))
        a.append(int(float(testlabeldata[i])))
    
    '''
    f = open('F:/goverment/ArticleMining/predict.txt', 'w')
    for i in range(len(predicted)):
        f.write(str(b[i]))
        f.write('\n')
    f.write("done")
    f.close()
    #for i in range(len(predicted)):
        #print(b[i])
    '''
    #metrics_result(a, b)
Original post: https://www.cnblogs.com/caiyishuai/p/9354035.html