Python3 贝叶斯分类

  1 # -*- coding: utf-8 -*-
  2 """
  3 Created on Tue Jan 16 20:11:07 2018
  4 
  5 @author: markli
  6 """
  7 import numpy as np;
  8 import pandas as pd;
  9 
 10 """
 11 贝叶斯分类
 12 训练数据格式
 13 X = [x1,x2,...xm]; m*n
 14 xi = [xi1,xi2,...xin].T
 15 Y = [y1,y2,...ym];1*m
 16 """
 17 
 18 class Bayes:
 19     def __init__(self,lamda,region):
 20         """
 21         lamda 贝叶斯修正参数
 22         region 特征属性取值域,类标签取值域
 23         例如两个特征取值范围分别为,A1=[1,2,3],A2=['S','M','L']
 24         类标签取值:C=[1,-1],region=[A1,A2,C]
 25         """
 26         self.lamda = lamda;
 27         #存放类标签域
 28         self.Y = region[-1];
 29         #存放特征取值域
 30         self.X = region[:-1];
 31         
 32         #存放先验概率 P(Y = Ck)
 33         self.PrioPro = np.zeros((1,len(region[-1])));
 34         
 35         #存放条件概率 P(Xj = ajl | Y = Ck)
 36         self.ConditionalPro = [];
 37         for i in range(len(region)-1):
 38             cp = np.zeros((len(region[-1]),len(region[i])));
 39             self.ConditionalPro.append(cp);
 40     
 41     def fit(self,TrainData):
 42         """
 43         计算先验概率和条件概率,建立模型
 44         TrainData 为二维数组
 45         TrainData 列的顺序与region中特征属性顺序一致
 46         TrainData 最后一列为类别
 47         """
 48         N = len(TrainData);
 49         K = len(self.Y);
 50         TrainData = TrainData.astype(np.str);
 51         
 52         NumofCk = pd.value_counts(TrainData[:,-1], sort=True); #Series 类型
 53         CountOfCk = [NumofCk[ck] for ck in self.Y]; #list类型
 54         self.PrioPro = [(ck+self.lamda) / (N + K * self.lamda) for ck in CountOfCk];
 55         
 56         j=0;
 57         for ck in self.Y:
 58             #选出类别为Ck的数据
 59             DataofCk = TrainData[np.where(TrainData[:,-1]==ck)]; 
 60             n = len(DataofCk);
 61             #选出第i个特征的数据
 62             for i in range(len(self.X)):
 63                 DataofCkandXi = DataofCk[:,i];
 64                 Numofaj = pd.value_counts(DataofCkandXi,sort=True); #为第i个特征的每个特征值计数
 65                 Countofaj = [Numofaj[aj] for aj in self.X[i]];
 66                 S = len(self.X[i]);
 67                 self.ConditionalPro[i][j] = [(aj+self.lamda) / (n+S * self.lamda) for aj in Countofaj];
 68             j = j+1;
 69         
 70     def predict(self,TestData):
 71         """
 72         预测实例,为其分类
 73         测试数据没有类别列,其余数据格式与训练数据格式一致
 74         """
 75         predictY = [];
 76         for i in range(len(TestData)):
 77             x = TestData[i];
 78             y = self.GetLable(x);
 79             predictY.append(y);
 80         return predictY;
 81     
 82     def GetLable(self,x):
 83         """
 84         输入一个测试实例x ,输出使其后验概率最大的类别y
 85         """
 86         pro = [];
 87         n = len(x);
 88         for j in range(len(self.Y)):
 89             p = 1;
 90             for i in range(n):
 91                 feature = self.ConditionalPro[i];
 92                 fi = self.X[i] #获得第i个特征的值域
 93                 index = fi.index(x[i]);
 94                 p = p * feature[j][index];
 95                 
 96             p = p * self.PrioPro[j];
 97             pro.append(p);
 98         
 99         y = self.Y[np.argmax(pro)];
100             
101         return y;
102         
103     

测试代码

 1 # -*- coding: utf-8 -*-
 2 """
 3 Created on Wed Jan 17 15:29:14 2018
 4 
 5 @author: markli
 6 """
 7 
 8 import numpy as np;
 9 import pandas as pd;
10 from BayesClass import Bayes
11 
# Value domains of the two features, followed by the class-label domain.
A1 = ['1', '2', '3']
A2 = ['S', 'M', 'L']
C = ['1', '-1']

# Each training row is (feature 1, feature 2, class label).
Data = np.array([
    [1, 'S', -1], [1, 'M', -1], [1, 'M', 1], [1, 'S', 1], [1, 'S', -1],
    [2, 'S', -1], [2, 'M', -1], [2, 'M', 1], [2, 'L', 1], [2, 'L', 1],
    [3, 'L', 1], [3, 'M', 1], [3, 'M', 1], [3, 'L', 1], [3, 'L', -1],
])
# A single unlabeled sample to classify.
test = np.array([[2, 'S']])

# Train without smoothing (lamda = 0), then print the predicted labels.
B = Bayes(0, [A1, A2, C])
B.fit(Data)
y = B.predict(test)
print(y)

 

posted on 2018-01-17 19:35  FightLi  阅读(251)  评论(0)  编辑  收藏  举报