k-means算法的Python实现

  1 #coding=utf-8
  2 import codecs
  3 import numpy
  4 from numpy import *
  5 import pylab
  6 
  7 def loadDataSet(fileName):
  8     dataMat = []
  9     fr = codecs.open(fileName)
 10     for line in fr.readlines():
 11         curLine = line.strip().split('	')
 12         fltLine = map(float, curLine)
 13         dataMat.append(fltLine)
 14     return dataMat    
 15     
 16 def distMeasure(vecA, vecB):
 17     #print vecA
 18     dist = sqrt(sum(power(vecA - vecB, 2)))
 19     return dist
 20     
 21 def kMeansInitCentroids(X, K):
 22     """
 23     KMEANSINITCENTROIDS This function initializes K centroids that are to be 
 24     used in K-Means on the dataset X
 25     centroids = KMEANSINITCENTROIDS(X, K) returns K initial centroids to be
 26     used with the K-Means on the dataset X.
 27     """
 28     n = shape(X)[1]
 29     centroids = mat(zeros((K,n)))
 30     for j in range(n):
 31         #print X[:,j]
 32         minJ = min(X[:,j])
 33         rangeJ = float(max(array(X)[:,j]) - minJ)
 34         centroids[:,j] = minJ + rangeJ * random.rand(K,1)
 35     return centroids
 36     
 37 def findClosestCentroids(X, centroids):
 38     """
 39     FINDCLOSESTCENTROIDS computes the centroid memberships for every example
 40     idx = FINDCLOSESTCENTROIDS (X, centroids) returns the closest centroids
 41     in idx for a dataset X where each row is a single example. idx = m x 1 
 42     vector of centroid assignments (i.e. each entry in range [1..K])
 43     """
 44     # 数据总量
 45     m = shape(X)[0]
 46     K = shape(centroids)[0]
 47     clusterAssment = mat(zeros((m,2)))#create mat to assign data points 
 48                                       #to a centroid, also holds SE of each point
 49     #centroids = createCent(dataSet, k)
 50     clusterChanged = True
 51     while clusterChanged:
 52         clusterChanged = False
 53         for i in range(m):#for each data point assign it to the closest centroid
 54             minDist = inf; minIndex = -1
 55             # k个中间数据（质心）都与数据i进行欧氏比较，选择距离最近的第minIndex类
 56             for j in range(K):
 57                 distJI = distMeasure(centroids[j,:],X[i,:])
 58                 if distJI < minDist:
 59                     minDist = distJI; minIndex = j
 60             if clusterAssment[i,0] != minIndex: clusterChanged = True
 61             clusterAssment[i,:] = minIndex,minDist**2
 62     return clusterAssment
 63     
 64 def computeCentroids(X, clusterAssment, K):
 65     """
 66     COMPUTECENTROIDS returs the new centroids by computing the means of the 
 67     data points assigned to each centroid.
 68     centroids = COMPUTECENTROIDS(X, idx, K) returns the new centroids by 
 69     computing the means of the data points assigned to each centroid. It is
 70     given a dataset X where each row is a single data point, a vector
 71     idx of centroid assignments (i.e. each entry in range [1..K]) for each
 72     example, and K, the number of centroids. You should return a matrix
 73     centroids, where each row of centroids is the mean of the data points
 74     assigned to it.
 75     """
 76     n = shape(X)[1]
 77     centroids = mat(zeros((K,n)))
 78     for centroid in range(K):#recalculate centroids
 79         # nonzero会产生两个array，第一个非零的为序号列表
 80         ptsInClust = X[nonzero(clusterAssment[:,0].A==centroid)[0]]#get all the point in this cluster
 81         #print 'ererer:',ptsInClust,'dfdf'
 82         centroids[centroid,:] = mean(ptsInClust, axis=0) #assign centroid to mean
 83     return centroids
 84 
 85 def show(dataSet, k, centroids, clusterAssment):
 86     from matplotlib import pyplot as plt  
 87     numSamples, dim = dataSet.shape  
 88     mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']  
 89     print type(dataSet)
 90     for i in xrange(numSamples):  
 91         markIndex = int(clusterAssment[i, 0])  
 92         plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])  
 93     mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']  
 94     for i in range(k):  
 95         plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize = 12)  
 96     plt.show()
 97     
 98 def runkMeans(X, initial_centroids,max_iters, plot_progress):
 99     """
100     RUNKMEANS runs the K-Means algorithm on data matrix X, where each row of X
101     is a single example
102     [centroids, idx] = RUNKMEANS(X, initial_centroids, max_iters, ...
103     plot_progress) runs the K-Means algorithm on data matrix X, where each 
104     row of X is a single example. It uses initial_centroids used as the
105     initial centroids. max_iters specifies the total number of interactions 
106     of K-Means to execute. plot_progress is a true/false flag that 
107     indicates if the function should also plot its progress as the 
108     learning happens. This is set to false by default. runkMeans returns 
109     centroids, a Kxn matrix of the computed centroids and idx, a m x 1 
110     vector of centroid assignments (i.e. each entry in range [1..K]).
111     """
112     (m,n) = shape(X)
113     K = shape(initial_centroids)[0]
114     centroids = initial_centroids   
115     clusterAssment = zeros((m,2))
116     
117     #Run K-Means
118     for i in range(max_iters):
119         clusterAssment = findClosestCentroids(X, centroids)
120         centroids = computeCentroids(X, clusterAssment, K);
121     
122     return centroids, clusterAssment
123 
124 def main():    
125     K =5
126     max_iters = 10
127     dataSet =  loadDataSet('E://PythonSpace//TextClustering//data//test2.txt')  
128     X = array(dataSet)
129     X = (X - mean(X)) / std(X)
130     
131     initial_centroids = kMeansInitCentroids(X, K)
132     myCentroids, clusterAssment = runkMeans(X, initial_centroids, max_iters,False);
133     print "-------------------------------------"
134     show(X, K, myCentroids, clusterAssment)
135     
136 main()
参考了Andrew Ng的Machine Learning Assignment(https://github.com/rieder91/MachineLearning/blob/master/Exercise%207/ex7/runkMeans.m)
以及博文http://www.cnblogs.com/MrLJC/p/4127553.html
运行结果：