莫烦Python:Scikit-learn (sklearn) 优雅地学会机器学习笔记(2)

cross-validation 交叉验证2

'''cross-validation (2)'''
from sklearn.model_selection import learning_curve           # 可视化学习的过程,怎样降低误差的
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Load the digits dataset and split it into features (X) and labels (y).
digits = load_digits()
X, y = digits.data, digits.target

# Compute a learning curve: the model is evaluated at each of the listed
# training-set fractions (the x positions of the plotted points), yielding
# per-fold training and cross-validation scores.
size_fractions = [0.1, 0.25, 0.5, 0.75, 1]
estimator = SVC(gamma=0.001)
train_sizes, train_loss, test_loss = learning_curve(
    estimator,
    X,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
    train_sizes=size_fractions,
)

# The scorer reports *negated* mean squared error, so flip the sign to get a
# positive loss. axis=1 averages each row, i.e. the scores obtained for one
# training size (one column per CV fold, per the learning_curve docs).
train_loss_mean = -train_loss.mean(axis=1)
test_loss_mean = -test_loss.mean(axis=1)

# Display both curves against the number of training examples.
curves = (
    (train_loss_mean, "r", "Training"),
    (test_loss_mean, "g", "Cross-validation"),
)
for mean_loss, colour, curve_label in curves:
    plt.plot(train_sizes, mean_loss, 'o-', color=colour, label=curve_label)

plt.xlabel("Training example")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()

gamma = 0.001
(图:gamma = 0.001 时的学习曲线)
gamma = 0.01
(图:gamma = 0.01 时的学习曲线)
过拟合:模型对训练集拟合得很好,但在测试集上的效果很差

怎样解决过拟合

如何选取合适的gamma,既不会出现过拟合,也不会出现欠拟合
在这里插入图片描述

保存model

'''保存model'''
from sklearn import svm
from sklearn import datasets
import pickle

# Train a default SVC on the iris data so there is a fitted model to persist.
clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)

# method 1 : pickle
import os

# open(..., 'wb') does not create missing directories, so make sure the
# target directory exists before saving.
os.makedirs('save', exist_ok=True)

# Save: serialize the fitted classifier to disk.
with open('save/clf.pickle', 'wb') as f:
    pickle.dump(clf, f)

# Restore: read the classifier back from the file written above.
# NOTE: only unpickle files you created yourself -- pickle.load can execute
# arbitrary code embedded in the file.
with open('save/clf.pickle', 'rb') as f:
    clf2 = pickle.load(f)

# Predict the class of the first sample with the restored model.
print(clf2.predict(X[0:1]))

# Expected output: [0]



from sklearn import svm
from sklearn import datasets
import pickle
#
# Train a default SVC on the iris data so there is a fitted model to persist.
clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)

# method 2 : joblib
import os

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; joblib is now imported as a standalone package. Fall back to the
# vendored copy only for old scikit-learn installs without standalone joblib.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump does not create missing directories, so make sure it exists.
os.makedirs('save', exist_ok=True)

# Save the fitted classifier to disk.
joblib.dump(clf, 'save/clf.pkl')

# Restore it and predict the class of the first sample with the loaded model.
clf3 = joblib.load('save/clf.pkl')
print(clf3.predict(X[0:1]))
posted @ 2019-12-22 09:04  旅人_Eric  阅读(129)  评论(0)  编辑  收藏  举报