Getting Started with sklearn
The goal of this article is to introduce the basic concepts of the sklearn library and to survey the common functionality it provides.
Official documentation
- Getting Started: https://scikit-learn.org/stable/getting_started.html
- sklearn homepage: https://scikit-learn.org/stable/
Training and prediction
sklearn defines a collection of machine-learning algorithms and models, called estimators. Calling an estimator's fit method trains the model. Here is a simple example:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)
X = [[ 1,  2,  3],   # 2 samples, 3 features
     [11, 12, 13]]
y = [0, 1]           # classes of each sample
print(clf.fit(X, y))
# RandomForestClassifier(random_state=0)
clf.predict(X) # predict classes of the training data
# array([0, 1])
clf.predict([[4, 5, 6], [14, 15, 16]]) # predict classes of new data
# array([0, 1])
The arguments of fit are:
- X: the samples' features, an array-like of shape (n_samples, n_features); a quick shape check is sketched below
- y: the label of each sample
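A minimal sketch of the expected shapes, using NumPy arrays instead of plain lists (the values are made up purely for illustration):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# one row per sample, one column per feature -> shape (n_samples, n_features)
X = np.array([[ 1,  2,  3],
              [11, 12, 13]])
# one label per sample -> shape (n_samples,)
y = np.array([0, 1])

print(X.shape)  # (2, 3)
print(y.shape)  # (2,)
RandomForestClassifier(random_state=0).fit(X, y)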
Preprocessing
Preprocessing classes are estimators as well: first call fit, then call transform to obtain the preprocessed result.
from sklearn.preprocessing import StandardScaler
X = [[0, 15], [1, -10]]
# scale data according to computed scaling values
print(StandardScaler().fit(X).transform(X))
# [[-1.  1.]
#  [ 1. -1.]]
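A fitted preprocessor keeps the values it computed during fit; for StandardScaler these are exposed as the mean_ and scale_ attributes, and the same fitted object can transform new data. A minimal sketch, reusing X from above:

scaler = StandardScaler().fit(X)
print(scaler.mean_)   # per-feature mean of X: [ 0.5  2.5]
print(scaler.scale_)  # per-feature standard deviation of X: [ 0.5 12.5]
# standardize new, unseen data with the statistics learned from X
print(scaler.transform([[2, 5]]))  # [[3.  0.2]]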
Pipelines
A pipeline bundles preprocessing steps and a model into a single object. Here is a concrete example:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# create a pipeline object
pipe = make_pipeline(
StandardScaler(),
LogisticRegression()
)
# load the iris dataset and split it into train and test sets
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# fit the whole pipeline
print(pipe.fit(X_train, y_train))
# we can now use it like any other estimator
acc = accuracy_score(y_test, pipe.predict(X_test))  # accuracy_score(y_true, y_pred)
print(acc)
# 0.9736842105263158
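The fitted steps remain accessible through named_steps; make_pipeline names each step after its lowercased class name. A minimal sketch, continuing the example above:

scaler = pipe.named_steps['standardscaler']
clf = pipe.named_steps['logisticregression']
print(scaler.mean_.shape)  # (4,)   -- one mean per iris feature
print(clf.coef_.shape)     # (3, 4) -- one weight vector per class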
Cross-validation
The cross_validate helper fits and scores an estimator across multiple folds in a single call:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
X, y = make_regression(n_samples=1000, random_state=0)
lr = LinearRegression()
result = cross_validate(lr, X, y) # defaults to 5-fold CV
print(result['test_score']) # r_squared score is high because dataset is easy
# [1. 1. 1. 1. 1.]
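cross_validate also takes a cv argument to change the number of folds and a scoring argument to choose a different metric; 'neg_mean_absolute_error' below is one of sklearn's built-in scorer strings. A minimal sketch, continuing the example above:

result = cross_validate(lr, X, y, cv=3, scoring='neg_mean_absolute_error')
print(result['test_score'])  # one score per fold (closer to 0 is better)
print(result['fit_time'])    # cross_validate also records per-fold timings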
Automatic hyperparameter search
sklearn provides classes that automatically search for the best combination of hyperparameters:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# define the parameter space that will be searched over
param_distributions = {'n_estimators': randint(1, 5), 'max_depth': randint(5, 10)}
# now create a searchCV object and fit it to the data
search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0), n_iter=5,
param_distributions=param_distributions, random_state=0)
print(search.fit(X_train, y_train))
print(search.best_params_)
# {'max_depth': 9, 'n_estimators': 4}
# the search object now acts like a normal random forest estimator
# with max_depth=9 and n_estimators=4
print(search.score(X_test, y_test))
# 0.735363411343253
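If the parameter space is small enough to enumerate, GridSearchCV exhaustively tries every combination instead of sampling like RandomizedSearchCV does. A minimal sketch; the tiny grid below is an assumption chosen just to keep it fast:

from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [2, 4], 'max_depth': [5, 9]}  # assumed toy grid
grid = GridSearchCV(estimator=RandomForestRegressor(random_state=0),
                    param_grid=param_grid)
grid.fit(X_train, y_train)  # tries all 4 combinations, each with 5-fold CV
print(grid.best_params_)
print(grid.score(X_test, y_test))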