基于LDA主题模型和SVM的文本分类

用LDA模型抽取文本特征，再用线性SVM分类，发现效果很差，F1=0.654。

Precision:0.680,Recall:0.649,F1:0.654

RandomForestClassifier的表现也比较差：

Precision:0.680,Recall:0.668,F1:0.670

而随便用一个深度学习模型(textCNN,LSTM+Attention)都能达到0.95+的F1，而且还不用处理特征、不用分词。

说下具体流程：提取LDA特征时，需要先用CountVectorizer对文本进行向量化，而向量化之前又需要先对文本进行分词。考虑到样本数量较多（搜狐新闻数据集，5个类别×3000条信息），这里使用了多进程（此处用进程池ProcessPoolExecutor来实现）来加速jieba分词。

import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import multiprocessing
from concurrent.futures import ProcessPoolExecutor,as_completed
from utils import log
from tqdm import tqdm
import time
import pickle as pk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC,SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score,recall_score,f1_score

def transform_text(text, stopwords):
    """Segment one document with jieba and return its tokens joined by commas.

    Tokens that are empty or whitespace-only, and tokens contained in
    ``stopwords``, are dropped.

    Args:
        text: the raw document string.
        stopwords: container supporting ``in`` with the tokens to discard.

    Returns:
        A single string of the kept tokens separated by ``','``.
    """
    kept = []
    for token in jieba.cut(text):
        # Skip blanks first, then stopwords (same test order as a combined
        # ``and`` condition would use).
        if not token.strip():
            continue
        if token in stopwords:
            continue
        kept.append(token)
    return ','.join(kept)

def cut_texts(lock,texts,stopwords,processName,doc_list=None):
    """Segment ``texts`` and append the results to ``doc_list`` under ``lock``.

    This is the "process + lock" variant of the segmentation worker.

    Args:
        lock: a lock object usable as a context manager; held only while the
            results are appended.
        texts: iterable of raw document strings.
        stopwords: container of tokens to drop (passed to transform_text).
        processName: label used in the log message.
        doc_list: list that receives the segmented documents. A fresh list is
            created when omitted.
            NOTE(review): when this runs in a separate process, doc_list
            presumably has to be a process-shared list (e.g. from a
            multiprocessing Manager) for the parent to see the results --
            confirm against the caller.

    Returns:
        The list the results were appended to.
    """
    # A mutable default ([]) would be shared by every call that omits the
    # argument, so create the list per call instead.
    if doc_list is None:
        doc_list=[]
    log('Process {} is cutting texts...'.format(processName))
    docs=[transform_text(text,stopwords) for text in tqdm(texts)]
    # "with" releases the lock even if extend() raises.
    with lock:
        doc_list.extend(docs)
    return doc_list

def cut_texts_pool(texts, stopwords, processName):
    """Segment every document in ``texts`` and return the results as a list.

    Worker function for the process-pool variant of the segmentation: it
    returns its results instead of writing them into a shared list.

    Args:
        texts: iterable of raw document strings.
        stopwords: container of tokens to drop (passed to transform_text).
        processName: label used in the log messages.

    Returns:
        List of segmented documents, in the same order as ``texts``.
    """
    log('Process {} is cutting texts...'.format(processName))
    segmented = [transform_text(item, stopwords) for item in tqdm(texts)]
    log('Process {} finished cutting.'.format(processName))
    return segmented

def hard_work(processName):
    """Test helper that simulates a slow task by sleeping for two seconds.

    Args:
        processName: label used in the log messages.

    Returns:
        ``processName`` unchanged, so the caller can tell which task finished.
    """
    start_message = 'Process {} is running...'.format(processName)
    log(start_message)

    time.sleep(2)

    end_message = 'Process {} finished.'.format(processName)
    log(end_message)
    return processName

def mp_pool_test(texts=None,res=None):
    """Smoke-test the process pool by running hard_work once per CPU.

    The returned task labels are logged in the order the tasks complete.

    Args:
        texts: unused; kept so existing callers keep working.
        res: unused; kept so existing callers keep working.
    """
    n_process=multiprocessing.cpu_count()
    # The context manager shuts the pool down (and joins its workers) even if
    # one of the tasks raises.
    with ProcessPoolExecutor() as pool:
        fs=[pool.submit(hard_work,i) for i in range(n_process)]
        # as_completed yields futures as they finish, not in submission order.
        names=[f.result() for f in as_completed(fs)]
    log(names)

def partition(iterable_,n_parittion):
    """Split an iterable into ``n_parittion`` contiguous, roughly equal lists.

    Every part except the last holds ``len // n_parittion`` items; the last
    part additionally takes the remainder. Order is preserved, so
    concatenating the parts reproduces the input.

    Args:
        iterable_: any iterable; it is materialised into a list first.
        n_parittion: number of parts, a positive int.

    Returns:
        A list of ``n_parittion`` lists.

    Raises:
        AssertionError: if ``n_parittion`` is not a positive int, or if the
            iterable holds fewer than ``n_parittion`` items.
    """
    # Raised explicitly (rather than via "assert") so the checks survive
    # "python -O", while keeping the exception type callers already see.
    if not (isinstance(n_parittion,int) and n_parittion>0):
        raise AssertionError('Invalid value for "n_partition"')
    items=list(iterable_)
    total=len(items)
    if total<n_parittion:
        raise AssertionError('Size of iterable is less than "n_partition"')

    partition_size=total//n_parittion
    parts=[items[partition_size*k:partition_size*(k+1)]
           for k in range(n_parittion-1)]
    # Compute the start of the last part directly instead of reusing the loop
    # variable, which does not exist when n_parittion == 1.
    parts.append(items[partition_size*(n_parittion-1):])
    return parts

def mp_cut_pool(texts,stopwords=None):
    """Segment ``texts`` with jieba using a pool of worker processes.

    The input is split into one contiguous chunk per CPU, each chunk is
    segmented by cut_texts_pool in a worker process, and the chunk results are
    concatenated in submission order. The returned list is therefore aligned
    with the input: item i is the segmented form of text i.

    Args:
        texts: iterable of raw document strings.
        stopwords: optional iterable of tokens to drop; none are dropped when
            omitted.

    Returns:
        List of segmented documents (comma-joined tokens), in input order.
    """
    texts=list(texts)
    # A set gives O(1) membership tests in the workers.
    stopwords=frozenset(stopwords) if stopwords is not None else frozenset()
    # One chunk per CPU.
    n_process=multiprocessing.cpu_count()
    # Too little work to split (or a single CPU): segment in this process
    # rather than asking partition() for more parts than it can produce.
    if n_process<2 or len(texts)<=n_process:
        return cut_texts_pool(texts,stopwords,0)
    chunks=partition(texts,n_process)
    docs=[]
    # The context manager shuts the pool down even if a worker raises.
    with ProcessPoolExecutor() as pool:
        # submit() schedules the call and returns a Future immediately.
        fs=[pool.submit(cut_texts_pool,chunk,stopwords,i)
            for i,chunk in enumerate(chunks)]
        # Collect in submission order (not completion order) so the output
        # stays aligned with the input texts and therefore with their labels.
        for f in fs:
            docs.extend(f.result())
    return docs

class LDA_Transformer:
    """Bag-of-words + LDA feature extractor.

    fit() learns a CountVectorizer vocabulary and an LDA model from a corpus;
    transform() maps documents to their topic distribution. Both methods must
    be given documents prepared the same way (here: jieba tokens joined by
    commas), otherwise the tokens seen at transform time will not match the
    fitted vocabulary.
    """
    def __init__(self,n_features):
        # Number of LDA topics, i.e. the length of each output vector.
        self.n_features=n_features

    def fit(self,texts):
        """Fit the vectorizer and the LDA model on ``texts``.

        Args:
            texts: list (or array-like with ``.shape``) of segmented documents.

        Returns:
            self, so calls can be chained.
        """
        log('Building CountVectorizer with texts...')
        ct=CountVectorizer()
        self.count_vectorizer=ct
        log(type(texts))
        if isinstance(texts,list):
            log('Len of texts:{}'.format(len(texts)))
        else:
            log('Shape of texts:{}'.format(texts.shape))
        # Show the first document as a sanity check; taking it through the
        # iterator avoids an IndexError on empty input.
        first=next(iter(texts),None)
        if first is not None:
            print('texts[0]',first)
        ctv=ct.fit_transform(texts)
        log('Building LDA model with CountVectorizer..')
        # n_components is the number of LDA topics, comparable to the
        # dimensionality of a word embedding.
        lda=LatentDirichletAllocation(n_components=self.n_features)
        lda.fit(ctv)
        log('Done building LDA model.')
        self.lda_model=lda
        return self

    def transform(self,texts):
        """Return the LDA topic distribution of each document in ``texts``.

        ``texts`` must be segmented the same way as the corpus given to fit().
        """
        count_vec=self.count_vectorizer.transform(texts)
        return self.lda_model.transform(count_vec)

def build_data():
    """Build LDA features and labels from the news spreadsheet and save them.

    Reads 'data/souhu_news_400_500.xlsx', shuffles the rows, segments the
    'content' column, fits an LDA_Transformer with 64 topics on the segmented
    documents, and writes:
      * output/lda_transformer.pkl -- the fitted transformer
      * data/y_lda.pkl             -- integer labels derived from 'topic'
      * data/X_lda.pkl             -- the LDA feature matrix
    """
    import os

    df=pd.read_excel('data/souhu_news_400_500.xlsx')
    log(df.columns)
    # Map each topic to an integer id before shuffling, so the mapping depends
    # on the file's row order rather than on the random shuffle.
    dic={topic:i for i,topic in enumerate(list(df['topic'].unique()))}

    indices=np.random.permutation(df.shape[0])
    df=df.iloc[indices]

    texts=list(df['content'])  # raw text column
    # Segmented documents, aligned row by row with the shuffled frame.
    docs=mp_cut_pool(texts)
    lda_transformer=LDA_Transformer(64)
    lda_transformer.fit(docs)
    # Save the fitted transformer; create the directory first so a missing
    # folder does not throw away the fit.
    os.makedirs('output',exist_ok=True)
    with open('output/lda_transformer.pkl','wb') as f:
        pk.dump(lda_transformer,f)

    y=[dic[topic] for topic in list(df['topic'])]
    with open('data/y_lda.pkl','wb') as f:
        pk.dump(y,f)

    # Transform the *segmented* documents: the vectorizer was fitted on
    # segmented text, so raw text would barely match its vocabulary.
    X=lda_transformer.transform(docs)
    with open('data/X_lda.pkl','wb') as f:
        pk.dump(X,f)
    log('Training data is saved.')

def load_train_data():
    """Load the saved LDA features and labels and split them 80/20.

    Reads 'data/X_lda.pkl' and 'data/y_lda.pkl' and passes them to
    train_test_split with ``test_size=0.2``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    loaded = []
    # Features first, then labels. These pickles are expected to be the ones
    # this script wrote itself; do not point this at untrusted files.
    for path in ('data/X_lda.pkl', 'data/y_lda.pkl'):
        with open(path, 'rb') as handle:
            loaded.append(pk.load(handle))
    features, labels = loaded

    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.2)
    return train_X, test_X, train_y, test_y

def main():
    """Build the LDA features, train a classifier and log its test metrics.

    Logs accuracy plus macro-averaged precision, recall and F1 on the held-out
    split returned by load_train_data().
    """
    log('Building training data...')
    build_data()
    log('Loading training data with LDA features...')
    X_train,X_test,y_train,y_test=load_train_data()
    # Replace with LinearSVC() to evaluate the linear SVM instead.
    model=RandomForestClassifier()
    # Name the model actually being trained instead of a hard-coded label.
    log('Training {} model..'.format(type(model).__name__))
    model.fit(X_train,y_train)
    log('Evaluating model...')
    y_pred=model.predict(X_test)
    # Same value as model.score(X_test,y_test) for a classifier, without a
    # second prediction pass.
    acc=float(np.mean(np.asarray(y_pred)==np.asarray(y_test)))
    log('Accuracy:{}'.format(acc))
    p=precision_score(y_test,y_pred,average='macro')
    r=recall_score(y_test,y_pred,average='macro')
    f1=f1_score(y_test,y_pred,average='macro')
    log('Precision:{:.3f},Recall:{:.3f},F1:{:.3f}'.format(p,r,f1))


# The guard keeps worker processes that re-import this module from running
# main() again.
if __name__=='__main__':
    main()