机器学习实战-随机森林二分类问题
随机森林
概论
前提
Random Forest:可以理解为Bagging with CARTS.
Bagging是bootstrap aggregating(引导聚集算法)的缩写。
CART(Classification and Regression Tree)分类和回归树,是一种二叉树。
这里涉及到集成式学习的概念,集成学习可以分为Bagging和Boosting.
Bagging:有放回的自助采样(bootstrap),组合多个弱分类器,采用少数服从多数的投票机制,可并行式运算。
Boosting:自适应的集成学习,顺序迭代,串行式运算。代表算法AdaBoost(Adaptive Boosting)
CART采用分而治之的策略。
回归树:采用分治策略,对于无法用唯一的全局线性回归来优化的目标进行分而治之,进而取得比较准确的结果。但分段后取均值并不是一个明智的选择,可以考虑将叶节点设置成一个线性函数,即分段线性模型树。
算法介绍链接:https://www.tuicool.com/articles/iiUfeim
数据集出处:https://archive.ics.uci.edu/ml/datasets/Connectionist+Bench+(Sonar,+Mines+vs.+Rocks)
python三维向量转二维向量
运行:print(sum([[[1,2,3],[4,5,5]],[[1,2,3],[4,5,5]]],[]))
输出:[[1, 2, 3], [4, 5, 5], [1, 2, 3], [4, 5, 5]]
python中多个实参,放到一个元组里面,以*开头,可以传多个参数
*args:(表示的就是将实参按照位置传值,多出来的值都给args,且以元组的方式呈现)
分类回归效果的判断指标:
Information Entropy(信息熵)、Gini Index(基尼指数)、Gini Split(基尼分割)、Misclassification Error(错误分类)
以上判断数值越小,模型的效果越好
Information Gain(信息增益),数值越大,效果越好
实战
数据集说明
【sonar-all-data.csv】
60 个输入变量表示声纳从不同角度返回的强度。这是一个二元分类问题(binary classification problem),要求模型能够区分出岩石和金属柱体的不同材质和形状,总共有 208 个观测样本。
附代码
# coding=utf-8
from random import seed
from random import randrange
from csv import reader
from math import sqrt
from math import log


class randomForest:
    """Random forest binary classifier built from scratch.

    Combines bagging (bootstrap-sampled training sets) with CART decision
    trees that consider only a random subset of features at each split,
    and evaluates the model with k-fold cross-validation.
    """

    def __init__(self):
        # Fix the RNG seed so sampling and feature selection are reproducible.
        seed(1)

    def load_csv(self, filename):
        """Load a CSV file into a list of rows (each row a list of strings).

        Blank lines are skipped.
        """
        dataset = list()
        with open(filename, 'r') as file:
            csv_reader = reader(file)
            for row in csv_reader:
                if not row:
                    continue
                dataset.append(row)
        return dataset

    def str_column_to_float(self, dataset, column):
        """Convert the given column of every row from string to float, in place."""
        for row in dataset:
            row[column] = float(row[column].strip())

    def str_column_to_int(self, dataset, column):
        """Encode the class labels in `column` as integers 0..k-1, in place.

        Returns the label -> integer lookup dict.
        """
        class_values = [row[column] for row in dataset]
        unique = set(class_values)
        lookup = dict()
        for i, value in enumerate(unique):
            lookup[value] = i
        for row in dataset:
            row[column] = lookup[row[column]]
        return lookup

    def subsample(self, dataset, ratio):
        """Draw a bootstrap sample of size round(len(dataset) * ratio).

        Sampling WITH replacement: some rows appear several times, others not
        at all, which gives each tree of the forest a different training set.
        """
        sample = list()
        n_sample = round(len(dataset) * ratio)
        while len(sample) < n_sample:
            index = randrange(len(dataset))
            sample.append(dataset[index])
        return sample

    def test_split(self, index, value, dataset):
        """Partition `dataset` into (left, right) on feature `index` < `value`."""
        left, right = list(), list()
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def gini_index(self, groups, class_values):
        """Gini impurity of a candidate split (lower is better, 0 = pure)."""
        gini = 0.0
        for class_value in class_values:
            for group in groups:
                size = len(group)
                if size == 0:
                    continue
                # Fraction of rows in this group that belong to class_value.
                proportion = [row[-1] for row in group].count(class_value) / float(size)
                gini += (proportion * (1.0 - proportion))
        return gini

    def get_split(self, dataset, n_features):
        """Find the best split among `n_features` randomly chosen features.

        Only a random subset of features is considered (not all of them);
        this is what de-correlates the individual trees of the forest.
        Returns {'index': feature index, 'value': threshold,
                 'groups': (left, right)}.
        """
        # BUG FIX: the original used `set(row[-1]) for row in dataset`, which
        # built a list of one-element sets (one per row).  gini_index then
        # compared each set against string labels, so every gini came out 0.0
        # and the first split tried always "won".  We need the set of distinct
        # class labels instead.
        class_values = list(set(row[-1] for row in dataset))
        b_index, b_value, b_score, b_groups = 999, 999, 999, None
        features = list()
        # Pick n_features distinct feature indices at random.
        while len(features) < n_features:
            index = randrange(len(dataset[0]) - 1)
            if index not in features:
                features.append(index)
        # Try every row's value of every chosen feature as a threshold and
        # keep the split with the lowest gini impurity.
        for index in features:
            for row in dataset:
                groups = self.test_split(index, row[index], dataset)
                gini = self.gini_index(groups, class_values)
                if gini < b_score:
                    b_index, b_value, b_score, b_groups = index, row[index], gini, groups
        return {'index': b_index, 'value': b_value, 'groups': b_groups}

    def to_terminal(self, group):
        """Leaf value: the most common class label in `group`."""
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)

    def split(self, node, max_depth, min_size, n_features, depth):
        """Recursively grow the tree below `node`.

        Stops (creating terminal leaves) when a branch is empty, when
        `max_depth` is reached, or when a branch has <= `min_size` rows.
        """
        left, right = node['groups']
        del(node['groups'])
        # One side empty: no useful split, both children get the majority label.
        if not left or not right:
            node['left'] = node['right'] = self.to_terminal(left + right)
            return
        # BUG FIX: the original was missing this `return`, so after creating
        # the depth-limit terminals it fell through and kept splitting —
        # max_depth was never actually enforced.
        if depth >= max_depth:
            node['left'], node['right'] = self.to_terminal(left), self.to_terminal(right)
            return
        # Left branch: terminal if small enough, otherwise recurse.
        if len(left) <= min_size:
            node['left'] = self.to_terminal(left)
        else:
            node['left'] = self.get_split(left, n_features)
            self.split(node['left'], max_depth, min_size, n_features, depth + 1)
        # Right branch: same treatment.
        if len(right) <= min_size:
            node['right'] = self.to_terminal(right)
        else:
            node['right'] = self.get_split(right, n_features)
            self.split(node['right'], max_depth, min_size, n_features, depth + 1)

    def build_tree(self, train, max_depth, min_size, n_features):
        """Build one decision tree: best root split, then recursive splitting."""
        root = self.get_split(train, n_features)
        self.split(root, max_depth, min_size, n_features, 1)
        return root

    def predict(self, node, row):
        """Classify `row` by walking the tree from `node` to a leaf."""
        if row[node['index']] < node['value']:
            if isinstance(node['left'], dict):
                return self.predict(node['left'], row)
            return node['left']
        if isinstance(node['right'], dict):
            return self.predict(node['right'], row)
        return node['right']

    def bagging_predict(self, trees, row):
        """Majority vote of all trees' predictions for one row."""
        predictions = [self.predict(tree, row) for tree in trees]
        return max(set(predictions), key=predictions.count)

    def random_forest(self, train, test, max_depth, min_size, sample_size,
                      n_trees, n_features):
        """Train `n_trees` trees on bootstrap samples and predict `test`.

        Returns the list of predicted labels, one per test row.
        """
        trees = list()
        for _ in range(n_trees):
            # Each tree is trained on its own bootstrap sample.
            sample = self.subsample(train, sample_size)
            tree = self.build_tree(sample, max_depth, min_size, n_features)
            trees.append(tree)
        predictions = [self.bagging_predict(trees, row) for row in test]
        return predictions

    def cross_validation_split(self, dataset, n_folds):
        """Randomly partition `dataset` into `n_folds` equal-sized folds.

        Sampling is WITHOUT replacement (rows are popped from a copy), so no
        row appears in two folds.
        """
        dataset_split = list()
        # Work on a copy so the caller's dataset is untouched.
        dataset_copy = list(dataset)
        # BUG FIX: the original used true division, giving a float fold size
        # that over-filled folds and exhausted dataset_copy (which the code
        # papered over with a break-on-empty check).  Integer division gives
        # exact fold sizes and the guard is no longer needed.
        fold_size = len(dataset) // n_folds
        for _ in range(n_folds):
            fold = list()
            while len(fold) < fold_size:
                index = randrange(len(dataset_copy))
                # pop() both returns the row and removes it from the pool.
                fold.append(dataset_copy.pop(index))
            dataset_split.append(fold)
        return dataset_split

    def accuracy_metric(self, actual, predicted):
        """Percentage of positions where `actual` and `predicted` agree."""
        correct = 0
        for i in range(len(actual)):
            if actual[i] == predicted[i]:
                correct += 1
        return correct / float(len(actual)) * 100.0

    def evaluate_algorithm(self, dataset, algorithm, n_folds, *args):
        """k-fold cross-validation of `algorithm` (e.g. self.random_forest).

        Each fold serves once as the test set while the remaining folds are
        flattened into the training set.  Extra *args are forwarded to
        `algorithm`.  Returns the list of per-fold accuracy percentages.
        """
        folds = self.cross_validation_split(dataset, n_folds)
        scores = list()
        for fold in folds:
            train_set = list(folds)
            train_set.remove(fold)
            # Flatten the remaining folds (list of lists of rows) into one list.
            train_set = sum(train_set, [])
            test_set = list()
            for row in fold:
                row_copy = list(row)
                test_set.append(row_copy)
                # Hide the label from the model; the original is kept in `fold`.
                row_copy[-1] = None
            predicted = algorithm(train_set, test_set, *args)
            actual = [row[-1] for row in fold]
            accuracy = self.accuracy_metric(actual, predicted)
            scores.append(accuracy)
        return scores


if __name__ == '__main__':
    rf = randomForest()
    # Load the sonar dataset: 60 float features + 1 class label per row.
    filename = 'sonar-all-data.csv'
    dataset = rf.load_csv(filename)
    # Convert every feature column (all but the label) from str to float.
    for i in range(0, len(dataset[0]) - 1):
        rf.str_column_to_float(dataset, i)
    # str_column_to_int is available to encode the label column if needed:
    # rf.str_column_to_int(dataset, len(dataset[0]) - 1)
    n_folds = 5
    max_depth = 10
    min_size = 1
    sample_size = 1.0
    # Number of features tried per split; the usual heuristic is
    # int(sqrt(len(dataset[0]) - 1)) — tune for the accuracy/diversity trade-off.
    n_features = 15
    # BUG FIX: the original called exit() inside this loop, so only the first
    # forest size (n_trees=1) was ever evaluated.
    for n_trees in [1, 10, 20]:
        scores = rf.evaluate_algorithm(dataset, rf.random_forest, n_folds,
                                       max_depth, min_size, sample_size,
                                       n_trees, n_features)
        print('Trees:%d' % n_trees)
        print('Scores:%s' % scores)
        print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))