Getting Started with Machine Learning --- Logistic Regression and Stochastic Gradient Ascent
# -*- coding: utf-8 -*-
import pandas
import numpy

# Create the feature (column) list
column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
               'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size',
               'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']

# Fetch the breast-cancer data set from the UCI repository
data = pandas.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
                       names=column_name)
# print(type(data))

# Replace the '?' placeholders with NaN
data = data.replace(to_replace='?', value=numpy.nan)
# Drop every row that still contains a missing value
data = data.dropna(how='any')

from sklearn.model_selection import train_test_split

# Split the data into a training set and a test set
# (X_train/y_train: training features/labels; X_test/y_test: test features/labels)
X_train, X_test, y_train, y_test = train_test_split(data[column_name[1:10]],
                                                    data[column_name[10]],
                                                    test_size=0.25, random_state=33)

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Standardize the features: fit the scaler on the training set only,
# then apply the same transform to the test set
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lr = LogisticRegression()
sc = SGDClassifier()

lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)

sc.fit(X_train, y_train)
sc_y_predict = sc.predict(X_test)

from sklearn.metrics import classification_report

print('LogisticRegression accuracy:', lr.score(X_test, y_test))
print(classification_report(y_test, lr_y_predict, target_names=['benign', 'malignant']))
print('SGDClassifier accuracy:', sc.score(X_test, y_test))
print(classification_report(y_test, sc_y_predict, target_names=['benign', 'malignant']))
# print(lr_y_predict)
# print(sc_y_predict)
Reading notes on 《机器学习及实践》.
Linear classification:
1. Logistic regression (LogisticRegression);
2. Stochastic gradient ascent (SGDClassifier);
Linear classifiers are the most common and most fundamental machine learning models. LogisticRegression solves for the model parameters precisely, optimizing over the full training set at once: training takes longer, but the resulting model tends to perform better. SGDClassifier instead estimates the parameters with stochastic gradient updates (for logistic regression, gradient descent on the log loss is equivalent to gradient ascent on the log-likelihood): training is much faster, but the resulting model usually performs slightly worse.
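To make the "stochastic gradient ascent" idea concrete, here is a minimal NumPy sketch (not from the book) of one training pass for logistic regression: each sample nudges the parameters along the gradient of its own log-likelihood term. The helper names (sigmoid, sgd_ascent_epoch), the learning rate, and the toy data are assumptions used only for illustration.

import numpy as np

def sigmoid(z):
    # Logistic function: squashes a real-valued score into a probability in (0, 1)
    return 1.0 / (1.0 + np.exp(-z))

def sgd_ascent_epoch(X, y, w, b, lr=0.01):
    # One pass of stochastic gradient ascent on the log-likelihood:
    # for each sample the gradient is (y - p) * x, so the update pushes
    # the predicted probability p toward the observed label y.
    for i in np.random.permutation(len(X)):
        p = sigmoid(X[i] @ w + b)    # predicted probability of the positive class
        error = y[i] - p             # per-sample gradient scale
        w = w + lr * error * X[i]    # update the weights
        b = b + lr * error           # update the bias
    return w, b

# Toy usage with synthetic data (illustration only)
rng = np.random.RandomState(0)
X = rng.randn(100, 9)                        # 100 samples, 9 standardized features
y = (X[:, 0] + X[:, 1] > 0).astype(float)    # synthetic binary labels
w, b = np.zeros(9), 0.0
for epoch in range(10):                      # a few passes over the shuffled data
    w, b = sgd_ascent_epoch(X, y, w, b)

scikit-learn's SGDClassifier performs the same kind of per-sample updates, framed as stochastic gradient descent on a loss function; with loss='log_loss' (loss='log' in older versions) it optimizes the same logistic objective as LogisticRegression, just incrementally rather than with a full-batch solver.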