sklearn 朴素贝叶斯

朴素贝叶斯的sklearn实现
 1 from sklearn import datasets
 2 iris = datasets.load_iris()
 3 from sklearn.naive_bayes import GaussianNB
 4 gnb = GaussianNB()
 5 y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
 6 print("Number of mislabeled points out of a total %d points : %d"
 7        % (iris.data.shape[0],(iris.target != y_pred).sum()))
 8 
 9 #贝叶斯估计的，带平滑，默认alpha为1 即拉普拉斯平滑
10 from sklearn.naive_bayes import MultinomialNB
11 clf = MultinomialNB(alpha=0.5)
12 y_pred1=clf.fit(iris.data, iris.target).predict(iris.data)
13 print("Number of mislabeled points out of a total %d points : %d"
14        % (iris.data.shape[0],(iris.target != y_pred1).sum()))
15 
16 a1=clf.fit(iris.data, iris.target).predict_proba(iris.data)
17 a2=clf.fit(iris.data, iris.target).predict_log_proba(iris.data)

PR曲线、ROC曲线与AUC得分

# Print the module docstring; this prints "None" when the module defines none.
print(__doc__)

import sys
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, roc_auc_score

def parse_labeled_scores(lines):
    """Parse ``label score`` records into two parallel lists.

    Each non-blank line holds an integer label followed by a float score,
    separated by any run of whitespace (spaces or tabs). Blank lines are
    skipped and fields after the second are ignored.

    Args:
        lines: Iterable of text lines, e.g. an open file object.

    Returns:
        A ``(labels, scores)`` tuple of equal-length lists.

    Raises:
        ValueError: If a non-blank line has fewer than two fields, or a
            field cannot be converted to ``int`` / ``float``.
    """
    labels = []
    scores = []
    for line_no, line in enumerate(lines, start=1):
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 2:
            raise ValueError(
                "line %d: expected '<label> <score>', got %r" % (line_no, line)
            )
        labels.append(int(fields[0]))
        scores.append(float(fields[1]))
    return labels, scores


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s <label_score_file>" % sys.argv[0])
    inputfile = sys.argv[1]

    with open(inputfile, 'r') as fd:
        label_list, score_list = parse_labeled_scores(fd)

    # ROC curve: false positive rate (fpr) against true positive rate (tpr).
    fpr, tpr, _ = roc_curve(label_list, score_list)
    # Store the area under a new name so the imported auc() function is not
    # overwritten by its own result.
    roc_auc = auc(fpr, tpr)
    # AUC score computed directly from the labels and scores.
    auc_score = roc_auc_score(label_list, score_list)
    # Precision-recall curve.
    precision, recall, _ = precision_recall_curve(label_list, score_list)

    ##########################################################################
    # Plot of a ROC curve for a specific class
    plt.figure()
    # Label the lines so that plt.legend() has entries to show.
    plt.plot(fpr, tpr, label='ROC curve')
    plt.plot([0, 1], [0, 1], 'k--', label='chance')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve (auc = %.2f)' % roc_auc)
    plt.legend(loc="lower right")
    plt.show()

    plt.figure()
    plt.plot(recall, precision, label='Precision-Recall curve')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('recall')
    plt.ylabel('precision')
    plt.title('Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.show()

通过排序统计计算AUC

import numpy as np
import matplotlib.pyplot as plt
def auc_by_rank(labels, scores):
    """Compute AUC by counting misordered (positive, negative) pairs.

    A pair is misordered when the positive sample (label ``1``) has a lower
    score than the negative sample (label ``-1``). Pairs with equal scores
    count as half a misordered pair, so the result does not depend on how a
    sort happens to order ties. Samples with any other label are ignored.

    Args:
        labels: Sequence of labels; ``1`` is positive and ``-1`` is negative.
        scores: Sequence of scores, one per label.

    Returns:
        The AUC as a float in ``[0, 1]``.

    Raises:
        ValueError: If there are no positive or no negative samples.
    """
    labels = np.asarray(labels, dtype=float)
    scores = np.asarray(scores, dtype=float)
    pos_scores = np.sort(scores[labels == 1])
    neg_scores = scores[labels == -1]
    if pos_scores.size == 0 or neg_scores.size == 0:
        raise ValueError(
            "AUC is undefined without at least one positive (1) and one "
            "negative (-1) sample"
        )
    # For every negative score: how many positives lie strictly below it,
    # and how many lie at or below it. The difference is the number of ties.
    strictly_below = np.searchsorted(pos_scores, neg_scores, side='left')
    at_or_below = np.searchsorted(pos_scores, neg_scores, side='right')
    misordered = strictly_below.sum() + 0.5 * (at_or_below - strictly_below).sum()
    return float(1.0 - misordered / (pos_scores.size * neg_scores.size))


if __name__ == "__main__":
    filepath = '/home/hadoop/bigdata/nb/roi_auc_test/auc.raw'
    labels = []
    scores = []
    # Each record is "<label>\t<score>"; blank lines are skipped.
    with open(filepath) as f:
        for line in f:
            if not line.strip():
                continue
            ss = line.split('\t')
            labels.append(float(ss[0]))
            scores.append(float(ss[1]))
    print(auc_by_rank(labels, scores))

根据定义计算auc

import numpy as np
import matplotlib.pyplot as plt
def auc_by_threshold_sweep(labels, scores, n_steps=5000):
    """Compute AUC from its definition by sweeping a decision threshold.

    The threshold takes ``n_steps + 1`` evenly spaced values between the
    minimum and maximum score. A sample is predicted positive when its score
    is strictly greater than the threshold. The resulting (FPR, TPR) points,
    together with the (0, 0) and (1, 1) endpoints, are integrated with the
    trapezoidal rule. Samples whose label is neither ``1`` nor ``-1`` are
    ignored.

    Args:
        labels: Sequence of labels; ``1`` is positive and ``-1`` is negative.
        scores: Sequence of scores, one per label.
        n_steps: Number of threshold intervals between min and max score.

    Returns:
        The area under the ROC curve as a float in ``[0, 1]``.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1, or there are no
            positive or no negative samples.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")
    labels = np.asarray(labels, dtype=float)
    scores = np.asarray(scores, dtype=float)
    pos_scores = np.sort(scores[labels == 1])
    neg_scores = np.sort(scores[labels == -1])
    if pos_scores.size == 0 or neg_scores.size == 0:
        raise ValueError(
            "AUC is undefined without at least one positive (1) and one "
            "negative (-1) sample"
        )

    # Walk the thresholds from high to low so that FPR and TPR are both
    # non-decreasing; the points are then already in curve order and no
    # re-sorting (which could scramble points sharing an FPR) is needed.
    thresholds = np.linspace(scores.min(), scores.max(), n_steps + 1)[::-1]
    # searchsorted(side='right') counts the scores <= threshold, so the
    # remainder is the number of samples predicted positive.
    tp = pos_scores.size - np.searchsorted(pos_scores, thresholds, side='right')
    fp = neg_scores.size - np.searchsorted(neg_scores, thresholds, side='right')

    # The lowest threshold equals the minimum score, so the sweep never
    # predicts every sample positive; add the (1, 1) endpoint explicitly,
    # and (0, 0) for symmetry, so the whole curve is integrated.
    tpr = np.concatenate(([0.0], tp / pos_scores.size, [1.0]))
    fpr = np.concatenate(([0.0], fp / neg_scores.size, [1.0]))

    # Trapezoidal rule over consecutive curve points.
    area = np.sum(np.diff(fpr) * (tpr[:-1] + tpr[1:]) / 2.0)
    return float(area)


if __name__ == "__main__":
    filepath = '/home/hadoop/bigdata/nb/roi_auc_test/auc.raw'
    labels = []
    scores = []
    # Each record is "<label>\t<score>"; blank lines are skipped.
    with open(filepath) as f:
        for line in f:
            if not line.strip():
                continue
            ss = line.split('\t')
            labels.append(float(ss[0]))
            scores.append(float(ss[1]))
    print(auc_by_threshold_sweep(labels, scores, n_steps=5000))