Naive Bayesian classifier
__author__ = 'HM'


def load_dataset(path):
    """Read a whitespace-separated training table from *path*.

    The first line is a header: every column except the last names an
    attribute, and the last column holds the class label.  Each following
    non-blank line is one training record.

    Args:
        path: path of the text file to read.

    Returns:
        A tuple ``(attributes, records)``: the attribute names in header
        order, and a list of dicts mapping every attribute name to the
        record's value, plus the key ``'class_label'`` mapped to its class.

    Raises:
        ValueError: if a record has fewer fields than the header.
    """
    # The context manager closes the file even if parsing fails.
    with open(path, 'r') as f:
        header = f.readline().split()
        attributes = header[:-1]
        records = []
        # Data starts on line 2; the number is only used in error messages.
        for line_no, line in enumerate(f, 2):
            fields = line.split()
            if not fields:
                # Blank (e.g. trailing) lines carry no record.
                continue
            if len(fields) < len(header):
                raise ValueError(
                    'line %d: expected %d fields, got %d'
                    % (line_no, len(header), len(fields)))
            # The class label is always the last field on the line.
            record = {'class_label': fields[-1]}
            record.update(zip(attributes, fields))
            records.append(record)
    return attributes, records


def train_classifier(data):
    """Count class and per-class attribute-value frequencies in *data*.

    The set of class labels, attribute names and attribute values is derived
    from *data* itself, so the function does not rely on module globals.
    No Laplacian (+1) correction is applied: a value never seen together
    with a class keeps a count of 0.

    Args:
        data: iterable of record dicts; each has a ``'class_label'`` key and
            one key per attribute.

    Returns:
        A tuple ``(attr_value_count, class_value_count)`` where
        ``attr_value_count[label][attribute][value]`` is the number of
        records of class ``label`` whose ``attribute`` equals ``value``
        (e.g. ``{'yes': {'credit_rating': {'fair': 10, 'excellent': 30}}}``)
        and ``class_value_count[label]`` is the number of records of that
        class (e.g. ``{'yes': 10, 'no': 20}``).
    """
    # Materialize once: the records are walked twice below.
    records = list(data)

    # First pass: collect every class label and every attribute's values.
    class_value_count = {}
    attr_values = {}
    for record in records:
        class_value_count.setdefault(record['class_label'], 0)
        for name, value in record.items():
            if name != 'class_label':
                attr_values.setdefault(name, set()).add(value)

    # Zero-fill the table so that every (class, attribute, value)
    # combination has an entry, including those that never occur together.
    attr_value_count = {}
    for label in class_value_count:
        attr_value_count[label] = {
            name: dict.fromkeys(values, 0)
            for name, values in attr_values.items()
        }
    print(attr_value_count)

    # Second pass: the actual counting.
    for record in records:
        label = record['class_label']
        for name, value in record.items():
            if name != 'class_label':
                attr_value_count[label][name][value] += 1
        class_value_count[label] += 1
    return attr_value_count, class_value_count


def predict(data, dataset_len, attr_value_count, class_value_count):
    """Score every class for one sample with the naive Bayes rule.

    For each class ``c`` the score is ``P(c) * prod(P(x_i | c))``, with
    ``P(c) = class_value_count[c] / dataset_len`` and
    ``P(x_i | c) = attr_value_count[c][attribute][value] / class_value_count[c]``.
    The scores are not normalized.  Intermediate values and the final table
    are printed.

    Args:
        data: dict mapping attribute name to the sample's value.
        dataset_len: number of training records.
        attr_value_count: per-class attribute-value counts, as returned by
            ``train_classifier``.
        class_value_count: per-class record counts, as returned by
            ``train_classifier``.

    Returns:
        Dict mapping each class label to its score.

    Raises:
        KeyError: if *data* names an attribute absent from the counts.
    """
    print(attr_value_count)
    print(class_value_count)
    p_c_x_table = {}
    for c, class_count in class_value_count.items():
        p_c = class_count / float(dataset_len)
        print('pc %s' % p_c)
        p_x_c = 1
        for key, value in data.items():
            # A value never observed in training has a count of zero.
            count = attr_value_count[c][key].get(value, 0)
            p_x_c *= count / float(class_count)
            print('p_x_c %s %s %s' % (p_x_c, value, count))
        p_c_x_table[c] = p_x_c * p_c
    print(p_c_x_table)
    return p_c_x_table


def main():
    """Train on ``data.txt`` and score one hard-coded sample."""
    _, data_set_raw = load_dataset('data.txt')
    for record in data_set_raw:
        print(record)
    d = {'age': '<=30', 'income': 'medium', 'student': 'yes',
         'credit_rating': 'fair'}
    predict(d, len(data_set_raw), *train_classifier(data_set_raw))


if __name__ == '__main__':
    main()
dataset:
age income student credit_rating buys_compute
<=30 high no fair no
<=30 high no excellent no
31…40 high no fair yes
>40 medium no fair yes
>40 low yes fair yes
>40 low yes excellent no
31…40 low yes excellent yes
<=30 medium no fair no
<=30 low yes fair yes
>40 medium yes fair yes
<=30 medium yes excellent yes
31…40 medium no excellent yes
31…40 high yes fair yes
>40 medium no excellent no
备注:数据的表示方式有点麻烦(各种字典套字典……),需要找个方法加以优化。
版权声明:本文为博主原创文章,未经博主允许不得转载。