Fork me on GitHub

Sklearn--(SVR)Regression学习笔记

今天介绍一个机器学习包:sklearn。其功能模块有regression、classification、clustering、dimensionality reduction、data preprocessing、model selection。

对我来说,常用的主要有regression(SVR)和classification(SVC)两个部分。

首先介绍一下用sklearn来做回归(线性回归、多项式回归,以及用sklearn.svm.SVR做非线性回归),如下:

1)多元线性回归

import numpy as np
from sklearn.linear_model import LinearRegression

# Locally seeded generator so the synthetic data set is reproducible.
rng = np.random.RandomState(10)
# 50 rows x 3 columns of uniform random values, scaled by 100.
x = 100 * rng.rand(50, 3)

# Pull each feature out as its own (50, 1) column vector.
x1, x2, x3 = (x[:, col].reshape(50, 1) for col in range(3))
# Standard-normal noise on top of a known linear relation, so the fitted
# coefficients can be checked against 1.25, 2.5, 3 and the intercept 10.
noise = rng.randn(50, 1)
y = 1.25 * x1 + 2.5 * x2 + 3 * x3 + 10 + noise


model = LinearRegression(fit_intercept=True)
model.fit(x, y)


# 1000 evenly spaced values from 0 to 50, used to exercise the model.
a = np.linspace(0, 50, 1000)
# Reshape the grid into a column; all three features share the same grid.
x1_fit = x2_fit = x3_fit = a.reshape(-1, 1)
# Stack the three columns side by side into a (1000, 3) feature matrix.
x_fit = np.column_stack((x1_fit, x2_fit, x3_fit))
# Predicted targets on the grid.
y_fit = model.predict(x_fit)
print("Model slope: ", model.coef_[0])
print("Model intercept:", model.intercept_)
# Coefficient of determination (R^2) on the training data.
print('方程的判定系数(R^2): %.2f' % model.score(x, y))

2)多项式回归

import random
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Build 30 noisy samples of y = -x^2 + 5x - 10 for x = -10 .. 19.
# The noise is uniform in [0, 20). Every value is wrapped in its own list
# so that both data sets are 2-D with one column.
xs = range(-10, 20)
x_data = [[x] for x in xs]
y_data = [[-(x ** 2) + 5 * x - 10 + random.random() * 20] for x in xs]

# Feature construction: expand each x into degree-2 polynomial features.
poly_reg = PolynomialFeatures(degree=2)
x_poly = poly_reg.fit_transform(x_data)


# Ordinary linear regression on the expanded features.
linear_reg = LinearRegression()
linear_reg.fit(x_poly, y_data)
plt.plot(x_data, y_data, 'b.')
# Predict through the already-fitted transformer: use transform(), not
# fit_transform(), so the transformer is never refitted at prediction time.
plt.plot(x_data, linear_reg.predict(poly_reg.transform(x_data)), 'r')
plt.show()

3)非线性回归(一元为例)

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV  # automatic parameter search
from sklearn.tree import DecisionTreeRegressor  # decision tree

from sklearn.ensemble import RandomForestRegressor  # random forest
import numpy as np
import matplotlib.pyplot as plt


# reshape(-1, 1) turns the flat list into a single column: [[68.67], [54.351], ...]
x = np.array([68.67, 54.351, 92.991, 80.39, 64.46]).reshape(-1, 1)
y = np.array([68.67, 54.351, 92.991, 80.39, 64.46]).reshape(-1, 1)

# Choose a model (alternatives kept for reference):
# model = SVR(kernel='rbf')
# model = DecisionTreeRegressor()
# model = RandomForestRegressor()
#
# With only 5 samples, cross-validation ends up with validation folds that
# hold a single sample. The default R^2 score is undefined for fewer than
# two samples, so candidates are ranked by negative mean squared error.
model = GridSearchCV(
    SVR(),
    param_grid={
        "kernel": ("linear", "rbf", "sigmoid"),
        "C": np.logspace(-3, 3, 7),
        "gamma": np.logspace(-3, 3, 7),
    },
    scoring="neg_mean_squared_error",
)
# SVR expects a 1-D target array, so flatten the column vector for fitting.
model.fit(x, y.ravel())

xneed = np.array([[1.2], [3.6]])
y_pre = model.predict(xneed)  # predict at the query points

plt.scatter(x, y, c='k', label='data', zorder=1)
plt.plot(xneed, y_pre, c='r', label='SVR_fit')
plt.legend()  # show the labels assigned above
plt.show()
print(model.best_params_)

 

补充:

1.如果要划分训练集和测试集,可以使用train_test_split:

from sklearn.model_selection import  train_test_split

# Hold out 30% of the samples as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
)

2.为了消除各特征之间量纲和数值范围的差异(SVR等模型对特征尺度比较敏感),通常先对数据进行预处理,如标准化。

from sklearn.preprocessing import StandardScaler

# Standardize x: fit the scaler on x, then rescale x with the fitted scaler.
scaler = StandardScaler().fit(x)
x_std = scaler.transform(x)

3.可以用GridSearchCV自动搜索最佳参数组合(下面的svc是待调参的模型,param_grid是候选参数字典,需要事先定义):

from sklearn.model_selection import GridSearchCV

# NOTE(review): `svc` and `param_grid` are not defined anywhere in this
# snippet. Presumably `svc` is an estimator instance (e.g. an SVC) and
# `param_grid` a dict of candidate parameter values; define both first.
# cv=3 requests 3-fold cross-validation; n_jobs=-1 uses all available cores.
grid = GridSearchCV(svc, param_grid, cv=3, n_jobs=-1)

4.模型保存

   from sklearn.externals import joblib   #用于保存和读取模型pkl

   joblib.dump(model, 'svr.pkl')     # 保存模型

   svr = joblib.load('svr.pkl')        # 读取模型

 

 过两天补充一下sklearn.svm.SVC...

 

 

 

 

 

 

 

posted @ 2019-12-03 22:07  Rser_ljw  阅读(4976)  评论(0)  编辑  收藏  举报