python实现knn

邻近算法，或者说K最近邻(kNN，k-NearestNeighbor)分类算法是数据挖掘分类技术中最简单的方法之一。所谓K最近邻，就是k个最近的邻居的意思，说的是每个样本都可以用它最接近的k个邻居来代表。

kNN算法的核心思想是如果一个样本在特征空间中的k个最相邻的样本中的大多数属于某一个类别，则该样本也属于这个类别，并具有这个类别上样本的特性。

概念很简单，更多的解释可以参考百度百科，有图有示例，讲的非常清楚。

接下来看看怎么用python实现KNN，代码中都是详细的注释：

首先是载入数据的部分函数，这里主要看看CIFAR10的数据格式就知道代码的意思了

 1 from __future__ import print_function
 2 
 3 from six.moves import cPickle as pickle
 4 import numpy as np
 5 import os
 6 from scipy.misc import imread
 7 import platform
 8 
 9 def load_pickle(f):
10     version = platform.python_version_tuple()
11     if version[0] == '2':
12         return  pickle.load(f)
13     elif version[0] == '3':
14         return  pickle.load(f, encoding='latin1')
15     raise ValueError("invalid python version: {}".format(version))
16 
def load_CIFAR_batch(filename):
    """Load one CIFAR batch file (the dataset is stored in several batches).

    The file is expected to unpickle to a dict with a ``'data'`` array and
    a ``'labels'`` sequence.

    Args:
        filename: path of the pickled batch file.

    Returns:
        A tuple ``(X, Y)``: ``X`` is a float array reshaped to
        ``(num_images, 32, 32, 3)`` and ``Y`` is the labels as a numpy
        array.
    """
    # The batch is a pickle, so the file must be opened in binary mode.
    with open(filename, 'rb') as f:
        datadict = load_pickle(f)
    X = datadict['data']
    Y = datadict['labels']
    # -1 lets numpy infer the number of images, so batches that do not hold
    # exactly 10000 images load as well. Each row is unpacked as
    # (channel, row, col) and then moved to channel-last order.
    X = X.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    Y = np.array(Y)
    return X, Y
26 
def load_CIFAR10(ROOT):
    """Load every CIFAR-10 batch found under the directory ``ROOT``.

    Reads ``data_batch_1`` .. ``data_batch_5`` as the training set and
    ``test_batch`` as the test set.

    Returns:
        ``(Xtr, Ytr, Xte, Yte)``: concatenated training images and labels,
        followed by the test images and labels.
    """
    train_paths = [
        os.path.join(ROOT, 'data_batch_%d' % (b, )) for b in range(1, 6)
    ]
    loaded = [load_CIFAR_batch(path) for path in train_paths]
    Xtr = np.concatenate([images for images, _ in loaded])
    Ytr = np.concatenate([labels for _, labels in loaded])
    # Drop the per-batch references before the test batch is read.
    del loaded
    test_path = os.path.join(ROOT, 'test_batch')
    Xte, Yte = load_CIFAR_batch(test_path)
    return Xtr, Ytr, Xte, Yte

然后是KNN类，定义了KNN的距离的计算方式、训练和预测函数：

  1 import numpy as np
  2 
  3 class KNearestNeighbor(object):
  4   """ 
  5   kNN 分类器 
  6   这里度量两张图片之间的距离就直接简单的采用L2距离
  7   实际上要达到比较好的效果需要设计更好的距离距离方式 
  8   """
  9 
 10   def __init__(self):
 11     pass
 12 
 13   def train(self, X, y):
 14     """
 15     训练过程基本上没有什么操作，只是简单的记录下所有的数据
 16 
 17     Inputs:
 18     - X(N, D) N个输入图片，每张图片表示为D位向量 
 19     - y(N,) 标签
 20     """
 21     self.X_train = X
 22     self.y_train = y
 23     
 24   def predict(self, X, k=1, num_loops=0):
 25     """
 26     对于新的输入，给出预测分类
 27 
 28     Inputs:
 29     - X(num_test, D) 
 30     - k: 选择用来决定输出的最相近邻居的个数
 31     - num_loops:这里实现了3种方式来实现L2距离的计算，比较一下计算速度，
 32                 都是利用了numpy的broadcast机制。
 33                 可以看到使用numpy内置的方式计算速度远远高于自己写的循环
 34 
 35     Returns:
 36     - y(num_test,)：预测的分类下标 
 37  
 38     """
 39     if num_loops == 0:
 40       dists = self.compute_distances_no_loops(X)
 41     elif num_loops == 1:
 42       dists = self.compute_distances_one_loop(X)
 43     elif num_loops == 2:
 44       dists = self.compute_distances_two_loops(X)
 45     else:
 46       raise ValueError('Invalid value %d for num_loops' % num_loops)
 47 
 48     return self.predict_labels(dists, k=k)
 49 
 50   def compute_distances_two_loops(self, X):
 51     """
 52     Inputs:
 53     - X(num_test, D)：test data.
 54 
 55     Returns:
 56     - dists(num_test, num_train)：dists[i, j]表示测试数据i和训练数据j之间的L2距离
 57     """
 58 
 59     num_test = X.shape[0]
 60     num_train = self.X_train.shape[0]
 61     dists = np.zeros((num_test, num_train))
 62     for i in range(num_test):
 63       for j in range(num_train):
 64         dists[i,j]=np.sqrt(np.sum(np.square(X[i]-self.X_train[j])))
 65     return dists
 66 
 67   def compute_distances_one_loop(self, X):
 68     num_test = X.shape[0]
 69     num_train = self.X_train.shape[0]
 70     dists = np.zeros((num_test, num_train))
 71     for i in range(num_test):
 72       dists[i,:]=np.sqrt(np.sum(np.square(X[i]-self.X_train),axis=1))
 73     return dists
 74 
 75   def compute_distances_no_loops(self, X):
 76  
 77     num_test = X.shape[0]
 78     num_train = self.X_train.shape[0]
 79     dists = np.zeros((num_test, num_train)) 
 80     #这里需要使用一点矩阵和广播的小技巧，具体的看下面的操作自己体会
 81     dists+=(np.sum(np.square(X),axis=1)).reshape(-1,1)
 82     dists+=(np.sum(np.square(self.X_train),axis=1)).reshape(1,-1)
 83     dists-=2*np.dot(X,self.X_train.T)
 84     dists=np.sqrt(dists)
 85     
 86     return dists
 87 
 88   def predict_labels(self, dists, k=1):
 89     """
 90     给出测试图片和训练图片的距离矩阵，为每个测试图片分类
 91 
 92     Inputs:
 93     - dists(num_test, num_train) 
 94 
 95     Returns:
 96     - y: (num_test,)   
 97     """
 98 
 99     num_test = dists.shape[0]
100     y_pred = np.zeros(num_test)
101     for i in range(num_test):
102       # 长度为k的list保存第i张测试图片距离最近的训练数据的下标
103       closest_y = []
104       closest_y=self.y_train[np.argsort(dists[i])[:k]]
105       y_pred[i]=np.argmax(np.bincount(closest_y))
106     return y_pred

最后是主函数部分，载入数据，调用KNN类的实例去训练和预测。并使用k折交叉验证去选择合适的超参数k：

  1 # coding: utf-8
  2 
  3 # KNN
  4 # KNN分类器主要分为两个步骤：
  5 # - 训练阶段, 简单的记忆所有的输入数据（存储）
  6 # - 预测阶段, 对于每一个输入，在所有的存储数据中选择k个与输入最接近的
  7 # - k是超参数
  8 # 
  9 
 10 
 11 import random
 12 import numpy as np
 13 from cs231n.data_utils import load_CIFAR10
 14 import matplotlib.pyplot as plt
 15 
 16 
 17 #get_ipython().run_line_magic('matplotlib', 'inline')
 18 plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
 19 plt.rcParams['image.interpolation'] = 'nearest'
 20 plt.rcParams['image.cmap'] = 'gray'
 21 
 22 # Load CIFAR-10 的数据.
 23 cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
 24 X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
 25 
 26 # 通过输出数据的维度检查数据加载是否正确
 27 print('Training data shape: ', X_train.shape)
 28 print('Training labels shape: ', y_train.shape)
 29 print('Test data shape: ', X_test.shape)
 30 print('Test labels shape: ', y_test.shape)
 31 
 32 
 33 # 可视化一些数据集中的样例.
 34 classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
 35 num_classes = len(classes)
 36 samples_per_class = 7
 37 for y, cls in enumerate(classes):
 38     idxs = np.flatnonzero(y_train == y)   #得到每一类对应图片的下标
 39     idxs = np.random.choice(idxs, samples_per_class, replace=False) #在该类的所有图片中随机选择
 40     for i, idx in enumerate(idxs):
 41         plt_idx = i * num_classes + y + 1
 42         plt.subplot(samples_per_class, num_classes, plt_idx)
 43         plt.imshow(X_train[idx].astype('uint8'))
 44         plt.axis('off')
 45         if i == 0:
 46             plt.title(cls)
 47 plt.show()
 48 
 49 
 50 # 采样，不使用全部数据，训练的更快一点，先来看看效果
 51 # 程序全部跑通之后可以优化一下方式，使用全部数据来试试效果
 52 num_training = 5000
 53 mask = list(range(num_training))
 54 X_train = X_train[mask]
 55 y_train = y_train[mask]
 56 
 57 num_test = 500
 58 mask = list(range(num_test))
 59 X_test = X_test[mask]
 60 y_test = y_test[mask]
 61 
 62 
 63 # 把图片Reshape到一维 
 64 X_train = np.reshape(X_train, (X_train.shape[0], -1))
 65 X_test = np.reshape(X_test, (X_test.shape[0], -1))
 66 print(X_train.shape, X_test.shape)
 67 
 68 
 69 from cs231n.classifiers import KNearestNeighbor
 70 
 71 classifier = KNearestNeighbor()
 72 classifier.train(X_train, y_train)
 73 
 74 dists = classifier.compute_distances_two_loops(X_test)
 75 print(dists.shape)
 76 
 77 # 可视化距离矩阵，每一行代表一张输入图片到所有训练数据的距离
 78 plt.imshow(dists, interpolation='none')
 79 plt.show()
 80 
 81 
 82 
 83 y_test_pred = classifier.predict_labels(dists, k=1)
 84 num_correct = np.sum(y_test_pred == y_test)
 85 accuracy = float(num_correct) / num_test
 86 print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
 87 
 88 
 89 
 90 y_test_pred = classifier.predict_labels(dists, k=5)
 91 num_correct = np.sum(y_test_pred == y_test)
 92 accuracy = float(num_correct) / num_test
 93 print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))
 94 
 95 
 96 
 97 
 98 dists_one = classifier.compute_distances_one_loop(X_test)
 99 
100 # 验证2种实现方式得到的距离矩阵是否等价
101 difference = np.linalg.norm(dists - dists_one, ord='fro')
102 print('Difference was: %f' % (difference, ))
103 if difference < 0.001:
104     print('Good! The distance matrices are the same')
105 else:
106     print('Uh-oh! The distance matrices are different')
107 
108 
109 
110 dists_two = classifier.compute_distances_no_loops(X_test)
111 difference = np.linalg.norm(dists - dists_two, ord='fro')
112 print('Difference was: %f' % (difference, ))
113 if difference < 0.001:
114     print('Good! The distance matrices are the same')
115 else:
116     print('Uh-oh! The distance matrices are different')
117 
118 
119 
def time_function(f, *args):
    """Return the number of seconds one call ``f(*args)`` takes.

    The return value of ``f`` is discarded.

    Args:
        f: callable to time.
        *args: positional arguments passed to ``f``.

    Returns:
        Elapsed time in seconds as a float.
    """
    import time
    # perf_counter is monotonic and high-resolution, so the result cannot
    # be distorted by system clock adjustments; fall back to time.time on
    # interpreters that do not provide it.
    timer = getattr(time, 'perf_counter', time.time)
    tic = timer()
    f(*args)
    toc = timer()
    return toc - tic
129 
# Time each distance implementation on the same test data, reporting
# every result as soon as it is measured.
timed_method = classifier.compute_distances_two_loops
two_loop_time = time_function(timed_method, X_test)
print('Two loop version took %f seconds' % two_loop_time)

timed_method = classifier.compute_distances_one_loop
one_loop_time = time_function(timed_method, X_test)
print('One loop version took %f seconds' % one_loop_time)

timed_method = classifier.compute_distances_no_loops
no_loop_time = time_function(timed_method, X_test)
print('No loop version took %f seconds' % no_loop_time)
138 
139 
# Choose k by k-fold cross-validation.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)
print(X_train_folds[0].shape)
print(y_train_folds[0].shape)


# Accuracies per k: every k ends up with num_folds accuracy values.
k_to_accuracies = {k_: [] for k_ in k_choices}

for i in range(num_folds):
    # Fold i is the validation set; all other folds form the training set.
    classifier = KNearestNeighbor()
    X_val_train = np.concatenate(X_train_folds[:i] + X_train_folds[i + 1:], axis=0)
    y_val_train = np.concatenate(y_train_folds[:i] + y_train_folds[i + 1:], axis=0)
    classifier.train(X_val_train, y_val_train)
    # The distance matrix does not depend on k, so compute it once per
    # fold rather than once per (fold, k) pair.
    val_dists = classifier.compute_distances_no_loops(X_train_folds[i])
    for k_ in k_choices:
        y_val_pred = classifier.predict_labels(val_dists, k=k_)
        num_correct = np.sum(y_val_pred == y_train_folds[i])
        accuracy = float(num_correct) / len(y_val_pred)
        k_to_accuracies[k_].append(accuracy)


for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))


# One scatter column of raw per-fold accuracies for each k.
for k in k_choices:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)


# Build mean/std in the same order as k_choices so that the x values of
# the plot and the argmax below line up even if k_choices is not sorted.
accuracies_mean = np.array([np.mean(k_to_accuracies[k]) for k in k_choices])
accuracies_std = np.array([np.std(k_to_accuracies[k]) for k in k_choices])
plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()


# Evaluate on the test set with the k of highest mean validation accuracy.
best_k = k_choices[np.argmax(accuracies_mean)]

classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=best_k)


num_correct = np.sum(y_test_pred == y_test)
accuracy = float(num_correct) / num_test
print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))