# Python text-mining template

import xlrd
import jieba
import sys  
import importlib
import os         #python内置的包，用于进行文件目录操作，我们将会用到os.listdir函数  
import pickle    #导入cPickle包并且取一个别名pickle #持久化类
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from pylab import mpl  
from sklearn.naive_bayes import MultinomialNB # 导入多项式贝叶斯算法包
from sklearn import svm

from sklearn import metrics 
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer
importlib.reload(sys)


# Module-level accumulators used to turn the contents and the categories
# into vector form.

# Every token kept after word segmentation (training set / test set).
trainContentdatasave = list()
testContentdatasave = list()

# Rows read from the content spreadsheets and from the label spreadsheets.
trainContentdata, testContentdata = [], []
trainlabeldata, testlabeldata = [], []

# Import the text-description data of the training and test sets
def importTrainContentdata():
    """Read the training texts from '20180716_train.xls'.

    For every row of the sheet named "Sheet1", a single-element list holding
    the value of the first cell is appended to the module-level
    ``trainContentdata`` list.
    """
    sheet = xlrd.open_workbook('20180716_train.xls').sheet_by_name("Sheet1")
    for row_idx in range(sheet.nrows):
        # Only the first column (index 0) is read.
        trainContentdata.append([sheet.cell(row_idx, 0).value])

def importTestContentdata():
    """Read the test texts from '20180716_test.xls'.

    For every row of the sheet named "Sheet1", a single-element list holding
    the value of the first cell is appended to the module-level
    ``testContentdata`` list.
    """
    sheet = xlrd.open_workbook('20180716_test.xls').sheet_by_name("Sheet1")
    for row_idx in range(sheet.nrows):
        # Only the first column (index 0) is read.
        first_cell = sheet.cell(row_idx, 0).value
        testContentdata.append([first_cell])

# Import the category (label) data of the training and test sets
def importTrainlabeldata():
    """Read the training labels from '20180716_train_label.xls'.

    For every row of the sheet named "Sheet1", a single-element list holding
    the value of the first cell is appended to the module-level
    ``trainlabeldata`` list.
    """
    sheet = xlrd.open_workbook('20180716_train_label.xls').sheet_by_name("Sheet1")
    for row_idx in range(sheet.nrows):
        # Only the first column (index 0) is read.
        trainlabeldata.append([sheet.cell(row_idx, 0).value])
        
def importTestlabeldata():
    """Read the test labels from '20180716_test_label.xls'.

    For every row of the sheet named "Sheet1", a single-element list holding
    the value of the first cell is appended to the module-level
    ``testlabeldata`` list.
    """
    sheet = xlrd.open_workbook('20180716_test_label.xls').sheet_by_name("Sheet1")
    for row_idx in range(sheet.nrows):
        # Only the first column (index 0) is read.
        first_cell = sheet.cell(row_idx, 0).value
        testlabeldata.append([first_cell])
"""
def importClassSet():
    file = 'ClassSet.xls'
    wb = xlrd.open_workbook(file)
    ws = wb.sheet_by_name("Sheet1")
    for r in range(ws.nrows):
        col = []
        for c in range(ws.ncols):
            col.append(ws.cell(r, c).value)
        ClassSet.append(col)
"""
def buildtrainbunch(bunch_path):
    """Segment the training texts with jieba and pickle them as a Bunch.

    The Bunch carries two lists: ``label`` (the rows of ``trainlabeldata``,
    unchanged) and ``contents`` (one string per row of ``trainContentdata``
    made of the kept tokens, each preceded by a comma). Every kept token is
    also appended to the module-level ``trainContentdatasave`` list.

    Args:
        bunch_path: Path of the pickle file to write.
    """
    bunch = Bunch(label=list(trainlabeldata), contents=[])

    for row in trainContentdata:
        # Use the cell values themselves. str() of the row list would give
        # its repr ("['...']"), in which line breaks and non-printable
        # characters show up as escape text (e.g. "\\n", "\\u3000"), so they
        # could neither be stripped below nor kept out of the tokens.
        if isinstance(row, (list, tuple)):
            text = "".join(str(cell) for cell in row)
        else:
            text = str(row)

        # Remove line breaks (CR and LF) and plain spaces before segmenting.
        text = text.replace("\r", "").replace("\n", "").replace(" ", "")

        # Keep tokens longer than one character; line-break tokens are
        # excluded explicitly as well.
        tokens = [
            token for token in jieba.cut(text)
            if len(token) > 1 and token not in ("\n", "\r\n")
        ]
        trainContentdatasave.extend(tokens)

        # Same on-disk format as before: every token is preceded by a comma.
        bunch.contents.append("".join("," + token for token in tokens))

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("构建训练数据文本对象结束！！！")

def buildtestbunch(bunch_path):
    """Segment the test texts with jieba and pickle them as a Bunch.

    The Bunch carries two lists: ``label`` (the rows of ``testlabeldata``,
    unchanged) and ``contents`` (one string per row of ``testContentdata``
    made of the kept tokens, each preceded by a comma). Every kept token is
    also appended to the module-level ``testContentdatasave`` list.

    Args:
        bunch_path: Path of the pickle file to write.
    """
    bunch = Bunch(label=list(testlabeldata), contents=[])

    for row in testContentdata:
        # Use the cell values themselves. str() of the row list would give
        # its repr ("['...']"), in which line breaks and non-printable
        # characters show up as escape text (e.g. "\\n", "\\u3000"), so they
        # could neither be stripped below nor kept out of the tokens.
        if isinstance(row, (list, tuple)):
            text = "".join(str(cell) for cell in row)
        else:
            text = str(row)

        # Remove line breaks (CR and LF) and plain spaces before segmenting.
        text = text.replace("\r", "").replace("\n", "").replace(" ", "")

        # Keep tokens longer than one character; line-break tokens are
        # excluded explicitly as well.
        tokens = [
            token for token in jieba.cut(text)
            if len(token) > 1 and token not in ("\n", "\r\n")
        ]
        testContentdatasave.extend(tokens)

        # Same on-disk format as before: every token is preceded by a comma.
        bunch.contents.append("".join("," + token for token in tokens))

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("构建测试数据文本对象结束！！！")
    

# Read a file as raw bytes (used for the stop-word list)
def _readfile(path):
    """Return the whole content of ``path`` as a bytes object."""
    with open(path, "rb") as handle:
        return handle.read()

# Load a pickled bunch object
def _readbunchobj(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so this should only be used
    on files produced by this script.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)
 
# Persist a bunch object
def _writebunchobj(path, bunchobj):
    """Pickle ``bunchobj`` into the file at ``path``, overwriting it."""
    with open(path, "wb") as handle:
        pickle.Pickler(handle).dump(bunchobj)
    
def vector_space(stopword_path,bunch_path,space_path):
    """Build the training term-weight space and pickle it.

    Loads the segmented Bunch from ``bunch_path``, vectorizes its
    ``contents`` with a ``TfidfVectorizer`` and writes a Bunch with the
    fields ``label``, ``tdm`` (the weight matrix) and ``vocabulary`` (the
    term -> column-index mapping) to ``space_path``.

    Args:
        stopword_path: Text file with one stop word per line.
        bunch_path: Pickled Bunch holding ``label`` and ``contents``.
        space_path: Path of the pickle file to write.
    """
    # _readfile returns bytes. The vectorizer compares stop words with str
    # tokens, so undecoded entries would never match and stop-word filtering
    # would silently do nothing.
    raw_stopwords = _readfile(stopword_path)
    try:
        stopword_text = raw_stopwords.decode("utf-8-sig")
    except UnicodeDecodeError:
        # NOTE(review): assumes a non-UTF-8 stop-word list is GBK encoded —
        # confirm against the actual file.
        stopword_text = raw_stopwords.decode("gbk", errors="ignore")
    stpwrdlst = [word.strip() for word in stopword_text.splitlines() if word.strip()]

    # Segmented documents and their labels.
    bunch = _readbunchobj(bunch_path)

    # tdm is a 2-D weight matrix: tdm[i][j] is the weight of term j (its index
    # in the vocabulary) for the i-th entry of bunch.contents.
    tfidfspace = Bunch(label=bunch.label,tdm=[], vocabulary={})

    # IDF weighting is switched off here (use_idf=False), so the weights are
    # sublinear term frequencies.
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, min_df=0.0001,use_idf=False,max_features=10000)

    # Turn the texts into the weight matrix and keep the vocabulary with it.
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_

    # Persist the vector space.
    _writebunchobj(space_path, tfidfspace)
    print("if-idf词向量空间实例创建成功！！！")

def testvector_space(stopword_path,bunch_path,space_path,train_tfidf_path):
    """Project the test texts onto the training vocabulary and pickle the result.

    Loads the segmented test Bunch from ``bunch_path`` and the training
    vector space from ``train_tfidf_path``, vectorizes the test ``contents``
    with the training vocabulary and writes a Bunch with the fields
    ``label``, ``tdm`` and ``vocabulary`` to ``space_path``.

    Args:
        stopword_path: Text file with one stop word per line.
        bunch_path: Pickled Bunch holding the test ``label`` and ``contents``.
        space_path: Path of the pickle file to write.
        train_tfidf_path: Pickled training vector space (provides ``vocabulary``).
    """
    # Stop words as a list of lines. NOTE: _readfile returns bytes, so these
    # entries do not match str tokens; which terms are counted is decided by
    # the fixed training vocabulary below.
    stpwrdlst = _readfile(stopword_path).splitlines()
    bunch = _readbunchobj(bunch_path)

    # tdm holds the computed weight matrix. vocabulary is the index of the
    # vector space: for the terms (我, 喜欢, 相国大人) it is
    # {"我": 0, "喜欢": 1, "相国大人": 2}, i.e. the axes of the space, each
    # value naming the dimension of its term.
    tfidfspace = Bunch(label=bunch.label,tdm=[], vocabulary={})

    # Reuse the vocabulary learned on the training set so that train and test
    # matrices share the same columns.
    trainbunch = _readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary

    # Vectorizer parameters worth knowing:
    #   stop_words   - words removed from the text before counting.
    #   vocabulary   - fixed term -> column mapping (see above).
    #   sublinear_tf - use 1 + log(tf) instead of the raw term frequency.
    #   smooth_idf   - smooths the IDF to avoid a zero denominator; on by
    #                  default and only relevant when IDF is used.
    #   norm         - normalization of the resulting vectors; on by default.
    #   max_df       - terms found in too many documents are treated as
    #                  temporary stop words (float: share of documents,
    #                  int: number of documents).
    #   min_df       - the opposite: terms found in too few documents only
    #                  add dimensions.
    #   max_df and min_df have no effect once a vocabulary is given, so they
    #   are not passed here.
    #
    # use_idf=False mirrors the vectorizer that builds the training space in
    # vector_space(). Leaving it at the default would fit IDF weights on the
    # test set and weight test features differently from training features.
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, use_idf=False, vocabulary=trainbunch.vocabulary)

    # With a fixed vocabulary and no IDF, fit_transform learns nothing from
    # the test documents; it only counts and scales them.
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    _writebunchobj(space_path, tfidfspace)
    print("if-idf词向量空间实例创建成功！！！")

def metrics_result(actual, predict):
    """Print weighted precision, recall and F1 score of a prediction.

    All three scores are computed with ``average='weighted'`` and restricted
    to the labels that occur in ``predict``.

    Args:
        actual: True labels.
        predict: Predicted labels.
    """
    # Precision and recall trade off against each other: ideally both are
    # high, but in practice raising one usually lowers the other.
    report = (
        ('精度:{0:.3f}', metrics.precision_score),
        ('召回:{0:0.3f}', metrics.recall_score),
        ('f1-score:{0:.3f}', metrics.f1_score),
    )
    for template, scorer in report:
        score = scorer(actual, predict, average='weighted', labels=np.unique(predict))
        print(template.format(score))
  
if __name__=="__main__":
    # Load the rows of the four spreadsheets into the module-level lists.
    importTrainContentdata()
    importTestContentdata()
    importTrainlabeldata()
    importTestlabeldata()

    # Locations of the pickled Bunch objects, the stop-word list and the
    # pickled vector spaces.
    train_bunch_path ="F:/goverment/ArticleMining/trainbunch.bat"
    test_bunch_path ="F:/goverment/ArticleMining/testbunch.bat"
    stopword_path ="F:/goverment/ArticleMining/hlt_stop_words.txt"
    train_space_path = "F:/goverment/ArticleMining/traintfdifspace.dat"
    test_space_path = "F:/goverment/ArticleMining/testtfdifspace.dat"

    # Segment the training and test texts and persist them as Bunch objects.
    buildtrainbunch(train_bunch_path)
    buildtestbunch(test_bunch_path)

    # Build the training vector space, then project the test set onto it.
    vector_space(stopword_path,train_bunch_path,train_space_path)
    testvector_space(stopword_path,test_bunch_path,test_space_path,train_space_path)

    # Load the vectorized training and test sets.
    train_set=_readbunchobj(train_space_path)
    test_set=_readbunchobj(test_space_path)

    print(train_set.tdm)

    # Disabled experiment: locate the largest entry of train_set.tdm.
    # mm=0
    # ii=0
    # jj=0
    # for i in range(3142):
    #     for j in range(3142):
    #         if train_set.tdm[i][j] >mm:
    #             mm=train_set.tdm[i][j]
    #             ii=i
    #             jj=j
    # print(ii)
    # print(jj)

    # Results noted by the original author for earlier configurations:
    #   low recall / F1: C=0.75 rbf: 0.59, C=0.8 rbf: 0.578
    #   C=0.75 poly gamma=10: precision 0.665, recall 0.330, f1-score 0.416
    #   C=0.7 kernel='poly' gamma=10: recall 0.331, f1-score 0.417
    # Original note on the naive Bayes variant: "alpha: 0.001 - the smaller
    # alpha, the more iterations and the higher the precision".
    #
    # Disabled alternatives (naive Bayes, polynomial SVM, logistic regression):
    # clf = MultinomialNB(alpha=0.052).fit(train_set.tdm, train_set.label)
    # #clf = svm.SVC(C=0.7, kernel='poly', gamma=10, decision_function_shape='ovr')
    # clf.fit(train_set.tdm, train_set.label)
    # predicted=clf.predict(test_set.tdm)
    #
    # tv = TfidfVectorizer()
    # train_data = tv.fit_transform(X_train)
    # test_data = tv.transform(X_test)
    #
    # lr = LogisticRegression(C=3)
    # lr.fit(train_set.tdm, train_set.label)
    # predicted=lr.predict(test_set.tdm)
    # print(lr.score(test_set.tdm, test_set.label))
    # #print(test_set.tdm)

    # The labels are stored as one single-element row per sample; flatten
    # them so the classifier receives a 1-D target instead of a column vector.
    train_labels = np.ravel(train_set.label)
    test_labels = np.ravel(test_set.label)

    # Train the classifier on the term vectors and their labels. Only the
    # svm module is imported by this file, so SVC is reached through it.
    clf = svm.SVC(C=1500)
    clf.fit(train_set.tdm, train_labels)
    predicted=clf.predict(test_set.tdm)
    print(clf.score(test_set.tdm, test_labels))

    # Disabled alternative (k-nearest neighbours):
    # from sklearn.neighbors import KNeighborsClassifier
    # knnclf = KNeighborsClassifier(n_neighbors=9)#default with k=5
    # knnclf.fit(train_set.tdm,train_set.label)
    # predicted = knnclf.predict(test_set.tdm)

    # Convert predictions and true labels to plain ints for the report.
    a=[]
    b=[]
    for label_row, predicted_label in zip(test_set.label, predicted):
        b.append(int(float(predicted_label)))
        a.append(int(label_row[0]))

    # Write one predicted label per line, followed by the end marker. The
    # file is closed even if a write fails, and the encoding is explicit
    # because the marker is not ASCII.
    with open('F:/goverment/ArticleMining/predict.txt', 'w', encoding='utf-8') as f:
        for predicted_label in b:
            f.write(str(predicted_label))
            f.write('\n')
        f.write("写好了")

    metrics_result(a, b)