sklearn 朴素贝叶斯
朴素贝叶斯的 sklearn 实现
# Naive Bayes classifiers from scikit-learn, fitted and evaluated on the iris
# data set. Both error counts are *training* errors: the models are scored on
# the same samples they were fitted on.
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

iris = datasets.load_iris()

# Gaussian naive Bayes.
gnb = GaussianNB()
y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
print("Number of mislabeled points out of a total %d points : %d"
      % (iris.data.shape[0], (iris.target != y_pred).sum()))

# Multinomial naive Bayes with additive smoothing. The library default
# alpha=1.0 is Laplace smoothing; alpha=0.5 is passed explicitly here.
clf = MultinomialNB(alpha=0.5)
# Fit once and reuse the fitted model for every kind of prediction below;
# refitting on identical data before each call only repeats the same work.
clf.fit(iris.data, iris.target)
y_pred1 = clf.predict(iris.data)
print("Number of mislabeled points out of a total %d points : %d"
      % (iris.data.shape[0], (iris.target != y_pred1).sum()))

a1 = clf.predict_proba(iris.data)      # per-class posterior probabilities
a2 = clf.predict_log_proba(iris.data)  # the same posteriors in log space
PR 曲线、ROC 曲线与 AUC 得分
"""Plot the ROC curve and the precision-recall curve for scored labels.

Usage: python <this script> <inputfile>

Every non-blank line of <inputfile> holds an integer label followed by a
floating-point score, separated by whitespace.
"""
import sys

import numpy as np


def read_labeled_scores(path):
    """Parse *path* into ``(labels, scores)``.

    Args:
        path: text file with one ``<int label> <float score>`` pair per line.

    Returns:
        A tuple ``(labels, scores)`` of equally long lists (``int`` and
        ``float`` respectively), in file order.

    Raises:
        ValueError: a field cannot be parsed as a number.
        IndexError: a non-blank line has fewer than two fields.
    """
    labels = []
    scores = []
    with open(path, 'r') as fd:
        for line in fd:
            # Split on any run of whitespace so tabs and repeated spaces
            # work, and skip blank lines instead of failing on int('').
            fields = line.split()
            if not fields:
                continue
            labels.append(int(fields[0]))
            scores.append(float(fields[1]))
    return labels, scores


if __name__ == "__main__":
    # The metrics/plotting stack is only needed when running as a script.
    import matplotlib.pyplot as plt
    from sklearn.metrics import (auc, precision_recall_curve, roc_auc_score,
                                 roc_curve)

    print(__doc__)
    if len(sys.argv) < 2:
        sys.exit("usage: %s <inputfile>" % sys.argv[0])
    inputfile = sys.argv[1]
    label_list, score_list = read_labeled_scores(inputfile)

    # ROC curve: false positive rate (fpr) against true positive rate (tpr).
    fpr, tpr, _ = roc_curve(label_list, score_list)
    # Stored under its own name: assigning the result to ``auc`` would
    # replace the imported function with a float.
    roc_auc = auc(fpr, tpr)
    # AUC computed directly from labels and scores.
    auc_score = roc_auc_score(label_list, score_list)
    # Precision-recall curve.
    precision, recall, _ = precision_recall_curve(label_list, score_list)

    # ROC figure. The curves carry labels so that plt.legend() has an entry
    # to draw.
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve (auc = %.2f)' % roc_auc)
    plt.legend(loc="lower right")
    plt.show()

    # Precision-recall figure.
    plt.figure()
    plt.plot(recall, precision, label='Precision-Recall curve')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('recall')
    plt.ylabel('precision')
    plt.title('Precision-Recall curve')
    plt.legend(loc="lower right")
    plt.show()
计算 AUC 的脚本
import numpy as np

filepath = '/home/hadoop/bigdata/nb/roi_auc_test/auc.raw'


def load_label_scores(path):
    """Read tab-separated ``label<TAB>score`` rows from *path*.

    Blank lines are skipped. Columns after the second are ignored.

    Returns:
        ``(labels, scores)`` as two 1-D float arrays in file order.
    """
    rows = []
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = line.split('\t')
            rows.append((float(fields[0]), float(fields[1])))
    table = np.array(rows, dtype=float).reshape(-1, 2)
    return table[:, 0], table[:, 1]


def rank_auc(labels, scores):
    """Return the AUC as the fraction of correctly ordered (pos, neg) pairs.

    A pair made of one positive (label ``1``) and one negative (label ``-1``)
    sample counts as an error when the negative scores higher, and as half an
    error when both scores are equal. Samples with any other label are
    ignored. The result is ``1 - errors / (n_pos * n_neg)``.

    Args:
        labels: sequence of labels, ``1`` for positive and ``-1`` for negative.
        scores: sequence of scores, same length as *labels*.

    Returns:
        The AUC as a ``float`` in ``[0, 1]``.

    Raises:
        ValueError: there is no positive or no negative sample, so no pair
            can be formed.
    """
    labels = np.asarray(labels)
    scores = np.asarray(scores, dtype=float)
    pos_scores = scores[labels == 1]
    neg_sorted = np.sort(scores[labels == -1])
    n_pos = pos_scores.size
    n_neg = neg_sorted.size
    if n_pos == 0 or n_neg == 0:
        raise ValueError(
            "AUC needs at least one positive (1) and one negative (-1) sample")

    # For every positive, binary-search the sorted negatives to count the
    # negatives strictly below it and the ones at or below it.
    below = np.searchsorted(neg_sorted, pos_scores, side='left')
    at_or_below = np.searchsorted(neg_sorted, pos_scores, side='right')
    above = n_neg - at_or_below
    tied = at_or_below - below

    # Ties are charged half an error explicitly; otherwise their contribution
    # would depend on the arbitrary order a sort gives to equal scores.
    errors = above.sum() + 0.5 * tied.sum()
    return float(1 - errors / (n_pos * n_neg))


if __name__ == "__main__":
    # Imported by the original snippet; the calculation itself does not use it.
    import matplotlib.pyplot as plt

    labels, scores = load_label_scores(filepath)
    print(rank_auc(labels, scores))
根据定义计算auc
import numpy as np

filepath = '/home/hadoop/bigdata/nb/roi_auc_test/auc.raw'


def load_label_scores(path):
    """Read tab-separated ``label<TAB>score`` rows from *path*.

    Blank lines are skipped. Columns after the second are ignored.

    Returns:
        ``(labels, scores)`` as two 1-D float arrays in file order.
    """
    rows = []
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = line.split('\t')
            rows.append((float(fields[0]), float(fields[1])))
    table = np.array(rows, dtype=float).reshape(-1, 2)
    return table[:, 0], table[:, 1]


def roc_auc_by_threshold_sweep(labels, scores, n_steps=5000):
    """Return the area under the ROC curve built from a threshold sweep.

    ``n_steps + 1`` evenly spaced thresholds between the smallest and the
    largest score are tried. For each threshold ``t`` a sample is predicted
    positive when ``score > t``, giving one (FPR, TPR) point. The points are
    integrated with the trapezoid rule.

    Args:
        labels: sequence of labels, ``1`` for positive and ``-1`` for
            negative; samples with any other label are not counted.
        scores: sequence of scores, same length as *labels*.
        n_steps: number of threshold intervals in the sweep.

    Returns:
        The area under the ROC curve as a ``float``.

    Raises:
        ValueError: there is no positive or no negative sample, so TPR or
            FPR is undefined.
    """
    labels = np.asarray(labels)
    scores = np.asarray(scores, dtype=float)
    pos_sorted = np.sort(scores[labels == 1])
    neg_sorted = np.sort(scores[labels == -1])
    n_pos = pos_sorted.size
    n_neg = neg_sorted.size
    if n_pos == 0 or n_neg == 0:
        raise ValueError(
            "AUC needs at least one positive (1) and one negative (-1) sample")

    # linspace lands exactly on the maximum score, so the last threshold
    # always yields the (0, 0) point.
    thresholds = np.linspace(scores.min(), scores.max(), n_steps + 1)

    # Samples with score > t are predicted positive; a binary search in the
    # sorted scores counts them for all thresholds at once.
    tpr = (n_pos - np.searchsorted(pos_sorted, thresholds, side='right')) / n_pos
    fpr = (n_neg - np.searchsorted(neg_sorted, thresholds, side='right')) / n_neg

    # The lowest threshold equals the minimum score and therefore already
    # rejects the lowest-scored samples. Add the "everything predicted
    # positive" point (1, 1) so the curve spans the whole FPR axis.
    fpr = np.concatenate(([1.0], fpr))
    tpr = np.concatenate(([1.0], tpr))

    # Thresholds ascend, so both rates are non-increasing; reversing gives
    # points ordered by ascending FPR with TPR ascending inside equal-FPR
    # runs, which is the order the trapezoid rule needs.
    fpr = fpr[::-1]
    tpr = tpr[::-1]
    return float(np.sum(np.diff(fpr) * (tpr[:-1] + tpr[1:]) / 2))


if __name__ == "__main__":
    # Imported by the original snippet; the calculation itself does not use it.
    import matplotlib.pyplot as plt

    labels, scores = load_label_scores(filepath)
    print(roc_auc_by_threshold_sweep(labels, scores))