Python3 贝叶斯分类
# -*- coding: utf-8 -*-
"""
Naive Bayes classifier for categorical (discrete-valued) features.

Created on Tue Jan 16 20:11:07 2018

@author: markli

Training data layout:
    X = [x1, x2, ..., xm]       m samples with n features each (m x n)
    xi = [xi1, xi2, ..., xin]   one sample
    Y = [y1, y2, ..., ym]       one class label per sample (1 x m)
"""
import numpy as np
import pandas as pd


class Bayes:
    """Naive Bayes classifier with additive (Bayesian / Laplace) smoothing.

    All comparisons between data and the declared value domains are done on
    the string form of the values, because ``fit`` converts the training
    data to strings.
    """

    def __init__(self, lamda, region):
        """Set up empty probability tables.

        Parameters
        ----------
        lamda : number
            Smoothing parameter added to every count (0 means plain
            maximum-likelihood estimates, 1 is Laplace smoothing).
        region : list of lists
            Value domain of every feature followed by the domain of the
            class label.  For two features A1 = [1, 2, 3] and
            A2 = ['S', 'M', 'L'] with classes C = [1, -1] pass
            ``region = [A1, A2, C]``.
        """
        self.lamda = lamda
        # Domain of the class label.
        self.Y = region[-1]
        # Domains of the features, in column order.
        self.X = region[:-1]

        # Prior probabilities P(Y = ck); replaced by a flat list in fit().
        self.PrioPro = np.zeros((1, len(self.Y)))

        # Conditional probabilities P(Xi = a | Y = ck): one table per
        # feature, rows indexed by class and columns by feature value.
        self.ConditionalPro = [
            np.zeros((len(self.Y), len(domain))) for domain in self.X
        ]

    @staticmethod
    def _count_values(values, domain):
        """Return how often each value of ``domain`` occurs in ``values``.

        Values of the domain that never occur get a count of 0 instead of
        raising ``KeyError``.
        """
        tally = pd.Series(values).value_counts()
        return np.array([tally.get(str(v), 0) for v in domain], dtype=float)

    def _smooth(self, counts, total, n_values):
        """Turn raw counts into smoothed probabilities.

        Computes ``(count + lamda) / (total + n_values * lamda)``.  When the
        denominator is zero (no samples and no smoothing) the probabilities
        are undefined; zeros are returned rather than dividing by zero.
        """
        denominator = total + n_values * self.lamda
        if denominator == 0:
            return np.zeros(len(counts))
        return (counts + self.lamda) / denominator

    def fit(self, TrainData):
        """Estimate the prior and conditional probabilities.

        Parameters
        ----------
        TrainData : 2-D array-like
            One row per sample.  The columns follow the order of the
            feature domains given in ``region``; the last column is the
            class label.

        Raises
        ------
        ValueError
            If ``TrainData`` is not two-dimensional.
        """
        # The builtin ``str`` replaces the ``np.str`` alias, which no longer
        # exists in current NumPy releases.
        data = np.asarray(TrainData).astype(str)
        if data.ndim != 2:
            raise ValueError("TrainData must be a two-dimensional array")

        labels = data[:, -1]
        class_counts = self._count_values(labels, self.Y)
        self.PrioPro = self._smooth(
            class_counts, len(data), len(self.Y)
        ).tolist()

        for j, ck in enumerate(self.Y):
            # Samples that belong to class ck.
            rows = data[labels == str(ck)]
            for i, domain in enumerate(self.X):
                # Count every value of feature i within class ck.
                counts = self._count_values(rows[:, i], domain)
                self.ConditionalPro[i][j] = self._smooth(
                    counts, len(rows), len(domain)
                )

    def predict(self, TestData):
        """Classify every row of ``TestData``.

        The rows have the same layout as the training data but without the
        class column.  Returns a list with one predicted label per row.
        """
        return [self.GetLable(x) for x in TestData]

    def GetLable(self, x):
        """Return the class with the largest posterior probability for ``x``.

        Parameters
        ----------
        x : sequence
            Feature values of a single sample, in domain order.

        Raises
        ------
        ValueError
            If a feature value is not part of the declared domain.
        """
        # Column of every feature value inside its conditional table; it is
        # the same for all classes, so look it up once.
        columns = []
        for i, value in enumerate(x):
            domain = [str(v) for v in self.X[i]]
            try:
                columns.append(domain.index(str(value)))
            except ValueError:
                raise ValueError(
                    "value %r is not in the domain of feature %d" % (value, i)
                ) from None

        scores = []
        for j in range(len(self.Y)):
            p = 1
            for i, column in enumerate(columns):
                p = p * self.ConditionalPro[i][j][column]
            scores.append(p * self.PrioPro[j])

        return self.Y[np.argmax(scores)]
测试代码
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 17 15:29:14 2018

@author: markli
"""

import numpy as np
import pandas as pd
from BayesClass import Bayes

# Value domains of the two features and of the class label.
A1 = ['1', '2', '3']
A2 = ['S', 'M', 'L']
C = ['1', '-1']

# Training samples: [feature 1, feature 2, class label].
Data = np.array([
    [1, 'S', -1], [1, 'M', -1], [1, 'M', 1], [1, 'S', 1], [1, 'S', -1],
    [2, 'S', -1], [2, 'M', -1], [2, 'M', 1], [2, 'L', 1], [2, 'L', 1],
    [3, 'L', 1], [3, 'M', 1], [3, 'M', 1], [3, 'L', 1], [3, 'L', -1],
])

# A single unlabeled sample to classify.
test = np.array([[2, 'S']])

# Smoothing parameter 0, then the feature domains followed by the class domain.
B = Bayes(0, [A1, A2, C])
B.fit(Data)
y = B.predict(test)
print(y)