knn 分类Helen数据集

  • knn流程
  1. 数据读取
  2. 数据归一化
  3. knn实现
  • 数据分析

以数据的前两项特征为横、纵坐标绘制散点图,结果如下:

  • 具体实现
import numpy as np
def read_data(path):
    """Parse a whitespace-separated data set into feature and label arrays.

    Despite its name, ``path`` is an already-opened text file object (anything
    exposing ``readlines()``), not a filesystem path.

    Every non-blank line contributes its first three columns as float
    features and its last column as the class name. Class names are encoded
    as ``'didntLike'`` -> 3, ``'largeDoses'`` -> 2 and anything else -> 1.

    :param path: open file-like object to read the lines from
    :return: tuple ``(data, label)`` of NumPy arrays
    """
    data = []
    label = []
    for line in path.readlines():
        fields = line.split()
        if not fields:
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no sample; indexing fields[-1] would raise.
            continue
        data.append(list(map(float, fields[0:3])))
        if fields[-1] == 'didntLike':
            label.append(3)
        elif fields[-1] == 'largeDoses':
            label.append(2)
        else:
            label.append(1)
    return np.array(data), np.array(label)

def normalized(data):
    """Unfinished normalization stub.

    Only evaluates the minimum of ``data`` along axis 0 and discards it;
    nothing is normalized and the function returns ``None``.

    :param data: object exposing ``min(axis)`` (e.g. a NumPy array)
    :return: ``None``
    """
    data.min(0)  # result is never used
    return None

def standdata(traindata):
    """Z-score standardize every column of a 2-D sample matrix.

    Each column has its mean subtracted and is divided by its (population)
    standard deviation. Columns whose standard deviation is zero are divided
    by 1 instead, so a constant feature maps to 0 rather than NaN.

    :param traindata: 2-D array, one sample per row
    :return: tuple ``(standardized, mean, std)`` where ``mean`` and ``std``
        are the per-column statistics exactly as computed (``std`` may
        contain zeros)
    """
    meandata0 = np.mean(traindata, axis=0)
    stddata0 = np.std(traindata, axis=0)
    # Guard the division only; the returned std keeps the true zeros.
    safe_std = np.where(stddata0 == 0, 1.0, stddata0)
    # Broadcasting applies the per-column statistics to every row.
    standardized = (traindata - meandata0) / safe_std
    return standardized, meandata0, stddata0
def autoNorm(x):
    """Min-max normalize every column of ``x`` into the range [0, 1].

    Columns whose minimum equals their maximum are divided by 1 instead of
    their zero range, so a constant feature maps to 0 rather than NaN.

    :param x: feature array to normalize, one sample per row
    :return: tuple ``(x_new, ranges, minVals)``: the normalized array, the
        per-column range (max - min, may contain zeros) and the per-column
        minimum
    """
    minVals = x.min(axis=0)
    maxVals = x.max(axis=0)
    ranges = maxVals - minVals

    # Guard the division only; the returned ranges keep the true zeros.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    x_new = (x - minVals) / safe_ranges  # broadcast across rows

    return x_new, ranges, minVals

def knn(traindata, testdata, label, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. ``k`` is clamped to the number of training
    samples, and any label values are supported (not only 1..3). When
    several classes tie for the most votes, the smallest label wins.

    :param traindata: 2-D array of training samples, one per row
    :param testdata: 1-D feature vector of the sample to classify
    :param label: labels of the training samples, aligned with ``traindata``
    :param k: number of neighbours that vote
    :return: the winning label as a plain Python scalar
    :raises ValueError: if ``k`` is below 1 or there are no training samples
    """
    distance = np.sqrt(np.sum((traindata - testdata) ** 2, axis=1))
    nearest = distance.argsort()

    # Never ask for more neighbours than there are training samples.
    n_neighbors = min(k, len(nearest))
    if n_neighbors < 1:
        raise ValueError("knn needs k >= 1 and at least one training sample")

    neighbor_labels = np.asarray(label)[nearest[:n_neighbors]]
    # np.unique returns sorted classes and argmax picks the first maximum,
    # so ties resolve to the smallest label.
    classes, counts = np.unique(neighbor_labels, return_counts=True)
    return classes[np.argmax(counts)].item()

def testknn(data, label, k):
    """Measure kNN accuracy on a fixed 90/10 train/test split.

    The first 90% of the rows (in their given order, no shuffling) are used
    as the training set and the remaining rows as the test set. For each
    test sample the predicted and the expected label are printed.

    :param data: feature array, one sample per row
    :param label: labels aligned with ``data``
    :param k: number of neighbours passed on to ``knn``
    :return: fraction of test samples classified correctly
    """
    split = int(np.shape(data)[0] * 0.9)
    train_data, test_data = data[0:split], data[split:]
    train_label, test_label = label[0:split], label[split:]

    hits = 0
    for idx, expected in enumerate(test_label):
        predicted = knn(train_data, test_data[idx], train_label, k)
        if predicted == expected:
            hits += 1
        print(predicted, expected)

    return float(hits) / len(test_data)

if __name__ == '__main__':
    # Script entry point: load the data set, min-max normalize it and report
    # the kNN accuracy for k=25.
    path = 'Knn_Helen'
    # Class names indexed by (label - 1), matching the encoding in read_data.
    true_label = ["smallDoses", 'largeDoses', 'didntLike']
    # The context manager closes the file once it has been read.
    with open(path, 'r') as file:
        print('=======')
        data, label = read_data(file)
    # Only the normalized features are needed; ranges and minima are unused.
    normalized_data, _ranges, _min_vals = autoNorm(data)
    acc = testknn(normalized_data, label, 25)
    print(acc)

在上述设置下(k=25,前 90% 的样本作为训练集、后 10% 作为测试集),准确率能达到 95% 以上。

原文地址:https://www.cnblogs.com/peng-yuan/p/14703275.html