python使用KNN文本分类

上次爬取的爸爸、妈妈、老师和自己的作文，利用sklearn.neighbors.KNeighborsClassifier进行分类。

import jieba
import pandas as pd
import numpy as np
import os
import itertools 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA

#读取文件内容
path = 'E:作文'
corpos = pd.DataFrame(columns=['filepath','text','kind'])
for root,dirs,files in os.walk(path):
    for name in files:
        filepath = root+'\'+name
        f = open(filepath,'r',encoding='utf-8')
        text = f.read()
        txt = ''.join(text.split('
'))
        kind = root.split('\')[-1]
        corpos.loc[len(corpos)] = [filepath,text.strip(),kind]

#设置停用词，构建词频矩阵
stopwords = pd.read_csv(r'Stopwords.txt', 
                        encoding='utf-8',sep='
')
def tokenizer(s):
    words=[]
    cut = jieba.cut(s)
    for word in cut:
        words.append(word)
    return words
count = CountVectorizer(tokenizer=tokenizer,
                        stop_words=list(stopwords['stopword']))
countvector = count.fit_transform(corpos.iloc[:,1]).toarray()

#将类别转化为数字
kind = np.unique(corpos['kind'].values)
nkind = np.zeros(700) 
for i in range(len(kind)):
    index = corpos[corpos['kind']==kind[i]].index
    nkind[index] = i+1
         
#将词频矩阵转化为二维数据，画图    
pca = PCA(n_components=2)
newvector = pca.fit_transform(countvector)
plt.figure()
for i,c,m in zip(range(len(kind)),['r','b','g','y'],['o','^','>','<']):
    index = corpos[corpos['kind']==kind[i]].index
    x = newvector[index,0]
    y = newvector[index,1]
    plt.scatter(x,y,c=c,marker=m,label=kind[i])
plt.legend()
plt.xlim(-5,10)
plt.ylim(-20,50)
plt.xlabel('X Label')
plt.ylabel('Y Label')

#随机选出测试集    
index = np.random.randint(0,700,200) 
x_test = countvector[index]
y_test = corpos.iloc[index,2]


#利用knn分类
knn = KNeighborsClassifier()
knn.fit(countvector,corpos.iloc[:,2])
y_pred = knn.predict(x_test)
knn.score(x_test,y_test)

#画knn分类结果的混淆矩阵

knn_confusion = confusion_matrix(y_test,y_pred)
'''
array([[61,  1,  0,  3],

       [ 8, 35,  0,  1],
       [ 1,  0, 53,  1],
       [ 9,  1,  2, 24]])
'''

plt.imshow(knn_confusion,interpolation='nearest',cmap=plt.cm.Oranges) 
plt.xlabel('y_pred')
plt.ylabel('y_True')
tick_marks = np.arange(len(kind))
plt.xticks(tick_marks,kind,rotation=90)
plt.yticks(tick_marks,kind)
plt.colorbar()
plt.title('confustion_matrix')
for i,j in itertools.product(range(len(knn_confusion)),range(len(knn_confusion))):
    plt.text(i,j,knn_confusion[j,i],
             horizontalalignment="center")

数据散点图如下所示：

knn分类结果的混淆矩阵图如下所示：