机器学习实战-学习笔记-第十三章

1.通过Windows 10“所有应用”->"Anaconda(64bit)"->"Anaconda Command Prompt",启动python。

2.在Python REPL中输入脚本

def loadDataSet(fileName, delim='\t'):
    """Load a delimited text file of numbers into a NumPy matrix.

    Each non-blank line is stripped of surrounding whitespace, split on
    ``delim`` and every field is converted with ``float``.

    Args:
        fileName: Path of the text file to read.
        delim: Field separator used within a line (a tab by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the file even when parsing fails.
    with open(fileName) as fr:
        # Blank lines (typically a trailing newline) carry no data and would
        # otherwise fail in float(''), so they are skipped.
        stringArr = [line.strip().split(delim) for line in fr if line.strip()]
    # Build real lists: map() is lazy on Python 3 and would hand the matrix
    # constructor iterator objects instead of numbers.
    datArr = [[float(field) for field in row] for row in stringArr]
    # asmatrix is the function the old `mat` alias pointed to; the alias
    # itself no longer exists in NumPy 2.0.
    return asmatrix(datArr)

def pca(dataMat, topNfeat=9999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: 2-D data, one sample per row and one feature per column.
        topNfeat: Number of principal components to keep. The default is
            large enough to keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data expressed
        in the retained components, and that projection mapped back into
        the original feature space with the mean added again.
    """
    # Work on a matrix so that `*` below is always a matrix product.
    dataMat = asmatrix(dataMat)
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # remove mean
    covMat = cov(meanRemoved, rowvar=False)
    # A covariance matrix is symmetric, so use the symmetric solver: it
    # always yields real eigenvalues/eigenvectors, whereas the general
    # `eig` can return complex values caused purely by round-off.
    eigVals, eigVects = linalg.eigh(asmatrix(covMat))
    eigValInd = argsort(eigVals)  # indices ordered smallest to largest
    # Walk backwards from the end to take the topNfeat largest eigenvalues.
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]  # columns ordered largest first
    lowDDataMat = meanRemoved * redEigVects  # transform into new dimensions
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat

运行结果如下:

Python 2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 16:44:52) [MSC v.1500 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
Anaconda is brought to you by Continuum Analytics.
Please check out: http://continuum.io/thanks and https://binstar.org
>>> from numpy import *
>>>
>>> def loadDataSet(fileName, delim='	'):
...     fr = open(fileName)
...     stringArr = [line.strip().split(delim) for line in fr.readlines()]
...     datArr = [map(float,line) for line in stringArr]
...     return mat(datArr)
...
>>> def pca(dataMat, topNfeat=9999999):
...     meanVals = mean(dataMat, axis=0)
...     meanRemoved = dataMat - meanVals #remove mean
...     covMat = cov(meanRemoved, rowvar=0)
...     eigVals,eigVects = linalg.eig(mat(covMat))
...     eigValInd = argsort(eigVals)            #sort, sort goes smallest to largest
...     eigValInd = eigValInd[:-(topNfeat+1):-1]  #cut off unwanted dimensions
...     redEigVects = eigVects[:,eigValInd]       #reorganize eig vects largest to smallest
...     lowDDataMat = meanRemoved * redEigVects#transform data into new dimensions
...     reconMat = (lowDDataMat * redEigVects.T) + meanVals
...     return lowDDataMat, reconMat

验证:

>>> dataMat = loadDataSet(r'F:\studio\MachineLearningInAction\ch13\testSet.txt')
>>> shape(dataMat)
(1000L, 2L)
>>> lowMat, reconMat = pca(dataMat, 1)
>>> shape(lowMat)
(1000L, 1L)
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.scatter(dataMat[:,0].flatten().A[0], dataMat[:,1].flatten().A[0], marker='^', s=90)
<matplotlib.collections.PathCollection object at 0x0000000009F556D8>
>>> ax.scatter(reconMat[:,0].flatten().A[0], reconMat[:,1].flatten().A[0], marker='^', s=90)
<matplotlib.collections.PathCollection object at 0x0000000009F55710>
>>> plt.show()
原文地址:https://www.cnblogs.com/littlesuccess/p/5096193.html