愚人节作业

1.X轴的分割:用了cut、groupby函数,cut可以把取值区间切成任意多份,从而得到近似的分布
2.作plot(x,y)与hist直方图,可以看到图像还是很接近的
3.用KL散度刻画两个分布的差异,并以list形式输出
4.对list按从大到小排序,可以得到各feature的差异性,也就是统计量角度的重要性排序
优点:对于之后给定的一组特征,不管多大,都可以得到他们的重要性排序
改进:刚装的mysqldb,要学习如何用python直接操作mysql
# -*- coding: utf-8 -*-
from __future__ import division
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
import MySQLdb
# conn=MySQLdb.connect('localhost','root','','qwe')
# cur = conn.cursor()
#
# aa=cur.execute("select * from unpaid")
# print aa
# info = cur.fetchmany(aa)[0]
# # for ii in info:
# # print ii
# cur.close()
# conn.commit()
# conn.close()

# Load the three application tables.
# NOTE(review): the path separators were missing from the original literals
# ('C:UsersjiejiaoDesktoppass.csv'); they are restored here on the assumption
# that the files live on the user's Desktop -- confirm before running.
passed = pd.read_csv(r'C:\Users\jiejiao\Desktop\pass.csv')
unpaid = pd.read_csv(r'C:\Users\jiejiao\Desktop\unpaid.csv')
paid = pd.read_csv(r'C:\Users\jiejiao\Desktop\paid.csv')

# Candidate feature columns: every column of ``paid`` that is not entirely
# NaN, minus the identifier/label columns and two excluded features.
# dropna()/drop() return new frames, so ``paid`` itself is left untouched and
# the CSV does not have to be read a second time.
feature = paid.dropna(how='all', axis=1)
feature = feature.drop(['uid', 'status', 'f50', 'f7'], axis=1)

# A feature with more than this many distinct (non-NaN) values in ``paid``
# goes to ``columns2``; every other feature goes to ``columns3``.
DISTINCT_VALUE_THRESHOLD = 5

columns1 = feature.columns
columns2 = []  # many distinct values: binned into a histogram later on
columns3 = []  # few distinct values: counted per distinct value later on
for name in columns1:
    # nunique() ignores NaN, just like the group count it replaces.
    if paid[name].nunique() > DISTINCT_VALUE_THRESHOLD:
        columns2.append(name)
    else:
        columns3.append(name)

# Single-argument print() emits the same text under Python 2 and Python 3.
print('columns2: %s' % (columns2,))
print('columns3: %s' % (columns3,))
feature1 = []  # symmetric KL divergence per feature in columns2
feature2 = []  # filled by the columns3 loop further down the script

N_BINS = 1000          # number of equal-width histogram bins per feature
SMOOTHING = 10 ** (-6)  # added to every probability so log() never sees 0


def asymetricKL(P, Q):
    """Return the one-directional KL divergence ``sum(P * log(P / Q))``.

    P and Q are array-likes of identical shape whose entries are strictly
    positive (the callers guarantee this by adding ``SMOOTHING``).
    """
    P = np.asarray(P, dtype=float)
    Q = np.asarray(Q, dtype=float)
    return np.sum(P * np.log(P / Q))


def _standardize(values):
    """Fill NaN with the mean, centre on the mean, scale to unit L2 norm."""
    filled = values.fillna(values.mean())
    centred = filled - filled.mean()
    norm = np.sqrt(np.sum(centred ** 2))
    if norm == 0:
        # Constant column: already all zeros, and dividing would give NaN.
        return centred
    return centred / norm


def _binned_distribution(frame, name, edges, labels):
    """Bin ``frame[name]`` on ``edges`` and return smoothed bin probabilities.

    The bin label of every row is stored in ``frame['numgroup']``.  The
    result is a 1-D array with one entry per label, in label order.
    """
    # include_lowest=True keeps the minimum sample, which otherwise falls
    # outside the first right-closed interval and is silently dropped.
    frame['numgroup'] = pd.cut(frame[name], edges, labels=labels,
                               include_lowest=True)
    # Category codes equal the bin positions; -1 marks rows outside all bins.
    codes = frame['numgroup'].cat.codes.values
    counts = np.bincount(codes[codes >= 0], minlength=len(labels))
    total = counts.sum()
    if total > 0:
        return counts / float(total) + SMOOTHING
    return counts + SMOOTHING


grouplabel = list(range(N_BINS))
for name in columns2:
    unpaid[name] = _standardize(unpaid[name])
    paid[name] = _standardize(paid[name])

    # One shared set of bin edges spanning both samples.  linspace() makes
    # the first and last edge exactly equal to the overall min and max.
    m = min(unpaid[name].min(), paid[name].min())
    M = max(unpaid[name].max(), paid[name].max())
    cutpoint = np.linspace(m, M, N_BINS + 1)

    Np = _binned_distribution(paid, name, cutpoint, grouplabel)
    Nu = _binned_distribution(unpaid, name, cutpoint, grouplabel)
    # To eyeball the two histograms, plot Nu and Np against grouplabel,
    # e.g. plt.plot(grouplabel, Nu) and plt.plot(grouplabel, Np).

    # Average of both directions gives a symmetric divergence.
    tt = (asymetricKL(Nu, Np) + asymetricKL(Np, Nu)) / 2.0
    feature1.append(tt)

print(feature1)
feature1_diff = pd.DataFrame(feature1, index=columns2)
#
SMOOTHING = 10 ** (-6)  # added to every probability so log() never sees 0


def asymetricKL(P, Q):
    """Return the one-directional KL divergence ``sum(P * log(P / Q))``.

    P and Q are array-likes of identical shape whose entries are strictly
    positive (the callers guarantee this by adding ``SMOOTHING``).
    """
    P = np.asarray(P, dtype=float)
    Q = np.asarray(Q, dtype=float)
    return np.sum(P * np.log(P / Q))


def _category_distribution(values, categories):
    """Return smoothed probabilities of ``categories`` within ``values``.

    Each frame's own values are counted; the previous code grouped them by
    the row-aligned column of ``passed``, which pairs unrelated rows.
    Categories absent from ``values`` get a count of 0.
    """
    counts = values.value_counts().reindex(categories, fill_value=0)
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    if total > 0:
        # Normalise to probabilities, as the binned-feature loop earlier in
        # the script does, so the result does not depend on sample size.
        counts = counts / total
    return counts + SMOOTHING


idx = []
for name in columns3:
    # The distinct non-NaN values of ``passed`` define the category set
    # shared by both samples.
    idx = list(passed[name].dropna().unique())

    Np = _category_distribution(paid[name], idx)
    Nu = _category_distribution(unpaid[name], idx)

    # Average of both directions gives a symmetric divergence.
    tt = (asymetricKL(Nu, Np) + asymetricKL(Np, Nu)) / 2.0
    feature2.append(tt)

feature2_diff = pd.DataFrame(feature2, index=columns3)
print(feature2_diff)
 
原文地址:https://www.cnblogs.com/jojo123/p/6656111.html