朴素贝叶斯分类器基本代码 && n折交叉优化 2

这个代码基于上一个代码

不同的是:读取了txt文件,改变了min_df与max_df的参数

import re
import pandas as pd
import warnings
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB as MNB #多项分布朴素贝叶斯公式
from sklearn.naive_bayes import BernoulliNB as BNB
from sklearn.model_selection  import cross_val_score
# Suppress every warning for the rest of the run so the script output stays clean.
warnings.filterwarnings("ignore")
def proces(col2):
    """Split a text string into lowercase alphabetic tokens.

    Every character that is not an ASCII letter is replaced by a space,
    the result is lower-cased and then split on whitespace.

    Args:
        col2: The raw text to tokenize.

    Returns:
        A list of lowercase word strings (empty if no letters are present).
    """
    letters_only = re.sub("[^a-zA-Z]", " ", col2)
    return letters_only.lower().split()
# Load the training set as a headerless table (pd.read_table splits on tabs
# by default): column 0 holds the sentiment label, column 1 the raw text.
# The line terminator must be the escape sequence '\n'; a literal line break
# inside the quotes is a syntax error.
train = pd.read_table('sentimentLabel.txt', lineterminator='\n', header=None, names=[0, 1])
print(train.head(5))
train_labers = train[0]
train_texts = train[1]

# Encode the string labels as integers; any label missing from the mapping
# becomes NaN after Series.map.
class_mapping = {'Negative': 0, 'Positive': 1}
train_labers = train_labers.map(class_mapping)

# The test set uses the same two-column layout and label encoding.
test = pd.read_table('test.txt', lineterminator='\n', header=None, names=[0, 1])
test_labers = test[0]
test_texts = test[1]
test_labers = test_labers.map(class_mapping)

# Normalise every document: tokenize with proces() and re-join the tokens
# with single spaces so the vectorizer receives clean lowercase text.
train_data = [' '.join(proces(text)) for text in train_texts]
test_data = [' '.join(proces(text)) for text in test_texts]

# NOTE: the vocabulary and IDF weights are fitted on the training and test
# text together, so the features are not independent of the test set.
data_all = train_data + test_data

count_vec = TfidfVectorizer(
    min_df=1,
    # An integer max_df is an absolute document count, not a fraction:
    # terms appearing in more than 60 documents are discarded.
    max_df=60,
    analyzer='word',
    ngram_range=(1, 2),  # unigrams and bigrams
    # These three options are boolean flags; pass real booleans rather
    # than the integer 1.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# Remember where the training rows end so the combined matrix can be split
# back into its two parts after vectorization.
length = len(train_data)
data_all = count_vec.fit_transform(data_all)
train_data = data_all[:length]
test_data = data_all[length:]


# Multinomial naive Bayes with default hyper-parameters. BernoulliNB
# (imported as BNB) can be swapped in instead: model = BNB()
model = MNB()
model.fit(train_data, train_labers)

# Hold-out evaluation on the test split (disabled):
# pred = model.predict(test_data)
# print("roc_auc", roc_auc_score(test_labers, pred))

# Disabled search for the fold count that maximises the mean ROC AUC:
# best_score = 0.7996632996632996
# best_cv = 5
# for i in range(400, 500):
#     score = np.mean(cross_val_score(model, train_data, train_labers, cv=i, scoring='roc_auc'))
#     if best_score < score:
#         best_score, best_cv = score, i
# print("roc_auc", best_score, best_cv)

# Report the mean ROC AUC over 297-fold cross-validation on the training set.
# NOTE(review): the fold count was presumably picked by a search like the one
# above; such a large cv also needs enough samples per class - confirm.
print("roc_auc", np.mean(cross_val_score(model, train_data, train_labers, cv=297, scoring='roc_auc')))
化繁为简 大巧不工
原文地址:https://www.cnblogs.com/mpeter/p/11172284.html