# Data compression with KMeans: quantize an image's colors to a small palette

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
import mahotas as mh

# Read the image and rescale its channel values by 1/255 into float64
# (the sample dump in this file shows values in the 0..1 range).
original_img = mh.imread('Penguins.jpg').astype(np.float64) / 255


'''
>>> original_img
array([[[ 0.45490196, 0.68627451, 0.81960784],
[ 0.4627451 , 0.68235294, 0.81960784],
[ 0.4627451 , 0.68235294, 0.81960784],
..., 
[ 0.34901961, 0.62352941, 0.81568627],
[ 0.35686275, 0.62352941, 0.81568627],
[ 0.35686275, 0.62352941, 0.81568627]],

[[ 0.45490196, 0.69019608, 0.82352941],
[ 0.45490196, 0.68235294, 0.82745098],
[ 0.45882353, 0.68627451, 0.83137255],
..., 
[ 0.34117647, 0.63137255, 0.80784314],
[ 0.34117647, 0.63529412, 0.8 ],
[ 0.34117647, 0.63529412, 0.8 ]],

[[ 0.4627451 , 0.69411765, 0.82745098],
[ 0.45882353, 0.68627451, 0.83137255],
[ 0.45882353, 0.68627451, 0.83137255],
..., 
[ 0.33333333, 0.63921569, 0.78823529],
[ 0.3372549 , 0.64313725, 0.78431373],
[ 0.3372549 , 0.64313725, 0.78431373]],

..., 
[[ 0.34509804, 0.4745098 , 0.35294118],
[ 0.50588235, 0.54901961, 0.5254902 ],
[ 0.76078431, 0.79215686, 0.56078431],
..., 
[ 0.43921569, 0.54117647, 0.5372549 ],
[ 0.39607843, 0.49803922, 0.50196078],
[ 0.34117647, 0.42352941, 0.42745098]],

[[ 0.31764706, 0.44705882, 0.40392157],
[ 0.37647059, 0.46666667, 0.49803922],
[ 0.30196078, 0.40392157, 0.29019608],
..., 
[ 0.44313725, 0.52156863, 0.51372549],
[ 0.43921569, 0.50980392, 0.51764706],
[ 0.36078431, 0.45882353, 0.44705882]],

[[ 0.30588235, 0.40784314, 0.37254902],
[ 0.31372549, 0.42352941, 0.47058824],
[ 0.31372549, 0.39607843, 0.36862745],
..., 
[ 0.40784314, 0.50588235, 0.48235294],
[ 0.41568627, 0.49803922, 0.50196078],
[ 0.33333333, 0.40392157, 0.40392157]]])
>>> original_img.shape[0] # height (number of rows)
434
>>> original_img.shape[1] # width (number of columns)
1024
>>> original_img.shape[2]
3
'''

# Remember the image's shape; ndarray.shape is already a tuple.
original_dimensions = original_img.shape


'''
>>> original_dimensions
(434, 1024, 3)
'''

# NOTE: the mahotas docs say the array layout is (height, width, 3)
# ("Must be of shape (h,w,3)"), so `width` below actually holds the row
# count and `height` the column count. The names are kept because later
# code in this script relies on them.
# http://mahotas.readthedocs.org/en/latest/api.html
width, height, depth = original_img.shape
# Collapse the two spatial axes so every row is one pixel's color vector.
image_flattened = original_img.reshape(width * height, depth)
'''
>>> image_flattened.shape
(444416, 3)
'''
# Randomly pick 1000 pixel colors to train on.
# Passing n_samples lets shuffle() return only the rows that are needed
# instead of materializing a shuffled copy of the whole image and slicing it.
# min() keeps the old slicing behavior for images with fewer than 1000
# pixels, where n_samples larger than the array would raise.
image_array_sample = shuffle(
    image_flattened,
    random_state=0,
    n_samples=min(1000, image_flattened.shape[0]),
)
'''
>>> image_array_sample
array([[ 0.2745098 , 0.37254902, 0.4 ],
[ 0.41568627, 0.6627451 , 0.82352941],
[ 0.64705882, 0.75686275, 0.94117647],
..., 
[ 0.11764706, 0.25490196, 0.33333333],
[ 0.7372549 , 0.79607843, 0.96470588],
[ 0.7254902 , 0.78823529, 0.92156863]])
>>> image_array_sample.shape
(1000, 3)
'''

# Cluster the sampled colors into 64 groups; fit() returns the estimator
# itself, so construction and training can be chained.
estimator = KMeans(n_clusters=64, random_state=0).fit(image_array_sample)

# Label every pixel of the full image with the index of its nearest cluster.
cluster_assignments = estimator.predict(image_flattened)

'''
>>> cluster_assignments
array([ 6, 6, 6, ..., 14, 14, 14])
>>> cluster_assignments.shape
(444416,)
>>> 
这样就给每一个颜色值分配了一个颜色标签（这样的标签共有64个）
'''

# The palette holds one representative color (cluster center) per row.
compressed_palette = estimator.cluster_centers_

# Map every pixel's cluster label to its palette color in a single
# vectorized lookup, replacing a per-pixel Python double loop.
# The labels are in the same row-major order as the flattened image, so
# reshaping back restores each pixel to its original position.
compressed_img = compressed_palette[cluster_assignments].reshape(
    width, height, compressed_palette.shape[1]
)

# Show the original and the compressed image side by side.
# Subplot codes 121 / 122 mean: 1 row, 2 columns, first / second position.
for position, title, image in (
    (121, 'Original Image', original_img),
    (122, 'Compressed Image', compressed_img),
):
    plt.subplot(position)
    plt.title(title)
    plt.imshow(image)
    plt.axis('off')  # hide ticks and frame; only the picture matters
plt.show()