# ===== 数据导入 (Data import) =====
import numpy as np
import pandas as pd
# Read a CSV with pandas; the result is a DataFrame (dict-of-columns structure), first row becomes the column index
data = pd.read_csv('data/kaggle_house_price_prediction/kaggle_hourse_price_train.csv') # comma-separated data (NOTE(review): 'hourse' spelling — presumably matches the actual file on disk, verify)
data = pd.read_csv('data/wine_quality/winequality-white.csv', delimiter=";") # ';'-separated data; overwrites the previous read — the two reads are alternative examples
# Read purely numeric data with numpy; the result is an ndarray
spambase = np.loadtxt('data/spambase/spambase.data', delimiter = ",")
# ===== 数据预处理 (DataFrame preprocessing) =====
# Drop every feature (column) that contains missing values
data.dropna(axis = 1, inplace = True)
# Drop a single feature; inplace=False returns a new frame, `data` is untouched
x = data.drop('quality', axis = 1, inplace = False)
# Keep only numeric features
data = data.select_dtypes(exclude=['object'])
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
# Keep only low-cardinality categorical features (fewer than 10 distinct values)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and
X_train_full[cname].dtype == "object"]
# Another way to exclude a feature: remove it from the column-name list
features = data.columns.tolist()
target = 'SalePrice'
features.remove(target)
# Build the label: map bad_loans (0/1) to safe_loans (+1 for good, -1 for bad), then drop the original column
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
del loans['bad_loans']
# Train/test split on a DataFrame: shuffle, then take the first 70% as training data
from sklearn.utils import shuffle
data_shuffled = shuffle(data, random_state = 32)  # keep random_state=32 fixed for reproducibility
num_of_samples = data_shuffled.shape[0]
split_line = int(num_of_samples * 0.7)
# BUG FIX: slice the shuffled frame — the original sliced the unshuffled `data`,
# which made the shuffle above dead code
train_data = data_shuffled.iloc[:split_line]
test_data = data_shuffled.iloc[split_line:]
# Alternatively, use sklearn's train_test_split (works on DataFrames and ndarrays alike)
from sklearn.model_selection import train_test_split
trainX, testX, trainY, testY = train_test_split(data[features], data[target], test_size = 0.3, random_state = 32)
# ===== 离散特征转向量 (One-hot encoding of categorical features) =====
def one_hot_encoding(data, features_categorical):
    """One-hot encode the given categorical columns of a DataFrame.

    For each column name in `features_categorical`, dummy columns
    (prefixed with the original column name) are appended on the right
    and the source column is removed. A new DataFrame is returned; the
    caller's frame is not mutated.
    """
    result = data
    for column in features_categorical:
        # Encode this column; dummy columns are named "<column>_<value>"
        dummies = pd.get_dummies(result[column], prefix=column)
        # Append the dummies, then drop the now-redundant source column
        result = pd.concat([result, dummies], axis=1).drop(columns=[column])
    return result
# ===== 数据预处理 (ndarray preprocessing) =====
# Preprocessing utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Standardize features to zero mean / unit variance
# (the original comment said "regularization", but StandardScaler performs standardization)
X = StandardScaler().fit_transform(X)
# 60% of samples for training, 40% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
# ===== 预测评价 (Prediction evaluation) =====
# Evaluation metrics (classification, clustering, regression)
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import fowlkes_mallows_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
# 10-fold cross-validation: each sample's prediction comes from a model that never saw it
from sklearn.model_selection import cross_val_predict
prediction = cross_val_predict(model, x, y, cv = 10)
# ===== 模型 (Models) =====
from sklearn.linear_model import LinearRegression  # linear regression
# Simple vs. multiple linear regression differ only in the number of features used
# Log-linear regression: apply np.log to trainY before fitting, np.exp to the predictions
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # linear discriminant analysis
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.tree import DecisionTreeClassifier  # decision tree for classification
model = DecisionTreeClassifier(max_depth = 11,max_leaf_nodes=14)  # params: max depth and max leaf count
from sklearn.tree import DecisionTreeRegressor  # decision tree for regression
from sklearn.ensemble import RandomForestRegressor  # random forest
from sklearn.neural_network import MLPClassifier  # multi-layer perceptron
# BUG FIX: the original line was missing its closing parenthesis (SyntaxError)
model = MLPClassifier(solver = 'sgd', learning_rate = 'constant', momentum = 0, learning_rate_init = 0.1, max_iter = 500)
from sklearn.neural_network import MLPRegressor
from sklearn.cluster import DBSCAN  # density-based clustering
from sklearn.mixture import GaussianMixture  # Gaussian mixture model
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
# ===== 绘图 (Plotting) =====
# Sweep the decision-tree max depth 1..9 and record cross-validated metrics
depths = np.arange(1, 10)
accuracys = []
precisions = []
recalls = []
f1s = []
for i in depths:
    model = DecisionTreeClassifier(max_depth = i)
    # 10-fold out-of-fold predictions for this depth
    prediction = cross_val_predict(model, dota2x, dota2y, cv = 10)
    # BUG FIX: score predictions against the true labels dota2y —
    # the original passed the unrelated `data` frame to accuracy_score
    acc = accuracy_score(dota2y, prediction)
    precision = precision_score(dota2y, prediction)  # assumes binary labels — TODO confirm
    recall = recall_score(dota2y, prediction)
    f1 = f1_score(dota2y, prediction)
    accuracys.append(acc)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
# Plot accuracy vs. max depth
# NOTE(review): assumes `import matplotlib.pyplot as plt` appears elsewhere in the file — confirm
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(depths, accuracys, label="accuracy score")
ax.set_xlabel("maxdepth")
ax.set_ylabel("accuracy")
ax.set_title("Decision tree Classifier")
ax.legend(framealpha=1)
plt.show()