Code notes for Introduction to Machine Learning with Python (Python机器学习基础教程)

from sklearn.datasets import load_iris
import numpy as np 
from scipy import sparse 
import matplotlib.pyplot as plt 
import pandas as pd 
from IPython.display import display 
import mglearn
from sklearn.model_selection import train_test_split
iris_dataset=load_iris()
import datetime
print('Keys of iris_dataset:\n{}'.format(iris_dataset.keys()))
Keys of iris_dataset:
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
print(iris_dataset['DESCR'][:193]+'\n...')
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, pre
...
print('Target names:{}'.format(iris_dataset['target_names']))
Target names:['setosa' 'versicolor' 'virginica']
print('Feature names:\n{}'.format(iris_dataset['feature_names']))
Feature names:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
print('Type of data:{}'.format(type(iris_dataset['data'])))
Type of data:<class 'numpy.ndarray'>
print('Shape of data:{}'.format(iris_dataset['data'].shape))
Shape of data:(150, 4)
print('First five rows of data:\n{}'.format(iris_dataset['data'][:5]))
First five rows of data:
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
print('Type of target:{}'.format(type(iris_dataset['target'])))
Type of target:<class 'numpy.ndarray'>
print('Shape of target:{}'.format(iris_dataset['target'].shape))
Shape of target:(150,)
print('Target:\n{}'.format(iris_dataset['target']))
Target:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(iris_dataset['data'],iris_dataset['target'],random_state=0)
print('X_train shape:{}'.format(X_train.shape))
print('y_train shape:{}'.format(y_train.shape))
X_train shape:(112, 4)
y_train shape:(112,)
print('X_test shape:{}'.format(X_test.shape))
print('y_test shape:{}'.format(y_test.shape))
X_test shape:(38, 4)
y_test shape:(38,)
import pandas as pd
iris_dataframe=pd.DataFrame(X_train,columns=iris_dataset['feature_names'])
grr=pd.plotting.scatter_matrix(iris_dataframe,c=y_train,figsize=(15,15),marker='o',hist_kwds={'bins':30},s=60,alpha=0.8,cmap=mglearn.cm3)

[figure: scatter matrix of the iris training set features, colored by class]

from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
KNeighborsClassifier(n_neighbors=1)
X_new=np.array([[5,2.9,1,0.2]])
print('X_new.shape:{}'.format(X_new.shape))
X_new.shape:(1, 4)
prediction=knn.predict(X_new)
print('Prediction:{}'.format(prediction))
print('Prediction target name:{}'.format(iris_dataset['target_names'][prediction]))
Prediction:[0]
Prediction target name:['setosa']
y_pred=knn.predict(X_test)
print('Test set predictions:\n{}'.format(y_pred))
Test set predictions:
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
print('Test set score:{:.2f}'.format(np.mean(y_pred==y_test)))
Test set score:0.97
print('Test set score:{:.2f}'.format(knn.score(X_test,y_test)))
Test set score:0.97
X,y=mglearn.datasets.make_forge()
mglearn.discrete_scatter(X[:,0],X[:,1],y)
plt.legend(['Class 0','Class 1'],loc=4)
plt.xlabel('First feature')
plt.ylabel('Second feature')
print('X.shape:{}'.format(X.shape))
X.shape:(26, 2)


(FutureWarning from mglearn: make_blobs is deprecated here; import make_blobs directly from scikit-learn)

[figure: scatter plot of the forge dataset]

X,y=mglearn.datasets.make_wave(n_samples=40)
plt.plot(X,y,'o')
plt.ylim(-3,3)
plt.xlabel("Feature")
plt.ylabel("Target")
Text(0, 0.5, 'Target')

[figure: the wave dataset, feature vs. target]

from sklearn.datasets import load_breast_cancer
cancer=load_breast_cancer()
print('cancer.keys():\n{}'.format(cancer.keys()))
cancer.keys():
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
print('Shape of cancer data:{}'.format(cancer.data.shape))
Shape of cancer data:(569, 30)
print('Sample counts per class:\n{}'.format({n:v for n,v in zip(cancer.target_names,np.bincount(cancer.target))}))
Sample counts per class:
{'malignant': 212, 'benign': 357}
print('Feature names:{}'.format(cancer.feature_names))
Feature names:['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
from sklearn.datasets import load_boston
boston=load_boston()
print('Data shape:{}'.format(boston.data.shape))
Data shape:(506, 13)
mglearn.plots.plot_knn_classification(n_neighbors=1)

[figure: predictions of a 1-nearest-neighbor classifier on forge]

mglearn.plots.plot_knn_classification(n_neighbors=3)

[figure: predictions of a 3-nearest-neighbors classifier on forge]

from sklearn.model_selection import train_test_split
X,y=mglearn.datasets.make_forge()
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
from sklearn.neighbors import KNeighborsClassifier
clf=KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train,y_train)
KNeighborsClassifier(n_neighbors=3)
print('Test set predictions :{}'.format(clf.predict(X_test)))
Test set predictions :[1 0 1 0 1 0 0]
print('Test set accuracy:{:.2f}'.format(clf.score(X_test,y_test)))
Test set accuracy:0.86
fig,axes=plt.subplots(1,3,figsize=(10,3))
for n_neighbors ,ax in zip([1,3,9],axes):
    clf=KNeighborsClassifier(n_neighbors=n_neighbors).fit(X,y)
    mglearn.plots.plot_2d_separator(clf,X,fill=True,eps=0.5,ax=ax,alpha=0.4)
    mglearn.discrete_scatter(X[:,0],X[:,1],y,ax=ax)
    ax.set_title('{} neighbor(s)'.format(n_neighbors))
    ax.set_xlabel('feature 0')
    ax.set_ylabel('feature 1')

[figure: k-NN decision boundaries for 1, 3 and 9 neighbors]

from sklearn.datasets import load_breast_cancer
cancer=load_breast_cancer()
X_train,X_test,y_train,y_test=train_test_split(cancer['data'],cancer['target'],stratify=cancer.target,random_state=66)
training_accuracy=[]
testing_accuracy=[]
neighbors_setting=range(1,11)
for n_neighbors in neighbors_setting:
    clf=KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_train,y_train)
    training_accuracy.append(clf.score(X_train,y_train))
    testing_accuracy.append(clf.score(X_test,y_test))
plt.plot(neighbors_setting,training_accuracy,label='training accuracy')
plt.plot(neighbors_setting,testing_accuracy,label='test accuracy')
plt.ylabel('Accuracy')
plt.xlabel('n_neighbors')
plt.legend()
<matplotlib.legend.Legend at 0x2eac8a873a0>

[figure: training and test accuracy vs. n_neighbors on the cancer data]
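A small follow-up sketch (my addition, reusing the lists built above): read off the best-performing n_neighbors from the test curve.
best_idx=int(np.argmax(testing_accuracy))  # index of the highest test accuracy
print('best n_neighbors:{}'.format(neighbors_setting[best_idx]))
print('best test accuracy:{:.2f}'.format(testing_accuracy[best_idx]))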

mglearn.plots.plot_knn_regression(n_neighbors=1)

[figure: predictions of 1-nearest-neighbor regression on wave]

mglearn.plots.plot_knn_regression(n_neighbors=3)

[figure: predictions of 3-nearest-neighbors regression on wave]

from sklearn.neighbors import KNeighborsRegressor
X,y=mglearn.datasets.make_wave(n_samples=40)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
reg=KNeighborsRegressor(n_neighbors=3)
reg.fit(X_train,y_train)
KNeighborsRegressor(n_neighbors=3)
print('Test set predictions:\n{}'.format(reg.predict(X_test)))
Test set predictions:
[-0.05396539  0.35686046  1.13671923 -1.89415682 -1.13881398 -1.63113382
  0.35686046  0.91241374 -0.44680446 -1.13881398]
print('Test set R^2: {:.2f}'.format(reg.score(X_test,y_test)))
Test set R^2: 0.83
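As a sanity check (my addition), the R^2 that reg.score reports can be recomputed by hand as 1 - SS_res/SS_tot:
pred_test=reg.predict(X_test)
ss_res=np.sum((y_test-pred_test)**2)  # residual sum of squares
ss_tot=np.sum((y_test-y_test.mean())**2)  # total sum of squares around the mean
print('manual R^2:{:.2f}'.format(1-ss_res/ss_tot))  # matches reg.score(X_test,y_test)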
fig,axes=plt.subplots(1,3,figsize=(15,4))
line=np.linspace(-3,3,1000).reshape(-1,1)
for n_neighbors ,ax in zip([1,3,9],axes):
    reg=KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(X_train,y_train)
    ax.plot(line,reg.predict(line))
    ax.plot(X_train,y_train,'^',c=mglearn.cm2(0),markersize=8)
    ax.plot(X_test,y_test,'v',c=mglearn.cm2(1),markersize=8)
    ax.set_title(
        '{} neighbor(s)\n train score:{:.2f} test score:{:.2f}'.format(
            n_neighbors,reg.score(X_train,y_train),reg.score(X_test,y_test)
        )
    )
    ax.set_xlabel('Feature')
    ax.set_ylabel('Target')
axes[0].legend(['Model predictions','Training data/target','Test data/target'],loc='best')
<matplotlib.legend.Legend at 0x2eac8517550>

[figure: k-NN regression fits for 1, 3 and 9 neighbors]

mglearn.plots.plot_linear_regression_wave()
w[0]: 0.393906  b: -0.031804

[figure: linear regression fit to the wave dataset]

from sklearn.linear_model import LinearRegression
X,y=mglearn.datasets.make_wave(n_samples=60)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)
lr=LinearRegression().fit(X_train,y_train)
print('lr.coef_:{}'.format(lr.coef_))
print('lr.intercept_:{}'.format(lr.intercept_))
lr.coef_:[0.39390555]
lr.intercept_:-0.031804343026759746
print("Training set score:{:.2f}".format(lr.score(X_train,y_train)))
print('Test set score:{:.2f}'.format(lr.score(X_test,y_test)))
Training set score:0.67
Test set score:0.66
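The fitted model is just y = X·w + b, so its predictions can be reproduced by hand (my sketch, using the coef_ and intercept_ printed above):
manual=X_test@lr.coef_+lr.intercept_  # X·w + b
print('max |manual - predict|:{:.2e}'.format(np.max(np.abs(manual-lr.predict(X_test)))))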
X,y=mglearn.datasets.load_extended_boston()
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
lr=LinearRegression().fit(X_train,y_train)
print("Train set score:{:.2f}".format(lr.score(X_train,y_train)))
print('Test set score:{:.2f}'.format(lr.score(X_test,y_test)))
Train set score:0.95
Test set score:0.61
from sklearn.linear_model import Ridge
ridge=Ridge().fit(X_train,y_train)
print("Training set score:{:.2f}".format(ridge.score(X_train,y_train)))
print("Test set score:{:.2f}".format(ridge.score(X_test,y_test)))
Training set score:0.89
Test set score:0.75
ridge10=Ridge(alpha=10).fit(X_train,y_train)
print("Training set score:{:.2f}".format(ridge10.score(X_train,y_train)))
print("Test set score:{:.2f}".format(ridge10.score(X_test,y_test)))
Training set score:0.79
Test set score:0.64
ridge01=Ridge(alpha=0.1).fit(X_train,y_train)
print("Training set score:{:.2f}".format(ridge01.score(X_train,y_train)))
print("Test set score:{:.2f}".format(ridge01.score(X_test,y_test)))
Training set score:0.93
Test set score:0.77
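The three alpha settings above can also be swept in one loop (my sketch, same extended Boston split):
for alpha in [0.1,1,10]:
    r=Ridge(alpha=alpha).fit(X_train,y_train)
    print('alpha={}: train {:.2f}, test {:.2f}'.format(alpha,r.score(X_train,y_train),r.score(X_test,y_test)))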
plt.plot(ridge.coef_,'s',label='Ridge alpha=1')
plt.plot(ridge10.coef_,'^',label='Ridge alpha=10')
plt.plot(ridge01.coef_,'v',label='Ridge alpha=0.1')
plt.plot(lr.coef_,'o',label='LinearRegression')
plt.xlabel('Coefficient index')
plt.ylabel('Coefficient magnitude')
plt.hlines(0,0,len(lr.coef_))
plt.ylim(-25,25)
plt.legend()
<matplotlib.legend.Legend at 0x2eac87cb070>

[figure: coefficient magnitudes of ridge (alpha=1, 10, 0.1) vs. linear regression]

mglearn.plots.plot_ridge_n_samples()

[figure: learning curves for ridge and linear regression on the Boston data]

from sklearn.linear_model import Lasso
lasso=Lasso().fit(X_train,y_train)
print("Training set score:{:.2f}".format(lasso.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lasso.score(X_test,y_test)))
print('Number of features used:{}'.format(np.sum(lasso.coef_!=0)))
Training set score:0.29
Test set score:0.21
Number of features used:4
lasso001=Lasso(alpha=0.01,max_iter=100000).fit(X_train,y_train)
print("Training set score:{:.2f}".format(lasso001.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lasso001.score(X_test,y_test)))
print('Number of features used:{}'.format(np.sum(lasso001.coef_!=0)))
Training set score:0.90
Test set score:0.77
Number of features used:33
lasso00001=Lasso(alpha=0.0001,max_iter=100000).fit(X_train,y_train)
print("Training set score:{:.2f}".format(lasso00001.score(X_train,y_train)))
print("Test set score:{:.2f}".format(lasso00001.score(X_test,y_test)))
print('Number of features used:{}'.format(np.sum(lasso00001.coef_!=0)))
Training set score:0.95
Test set score:0.64
Number of features used:96
plt.plot(lasso.coef_,'s',label='Lasso alpha=1')
plt.plot(lasso001.coef_,'^',label='Lasso alpha=0.01')
plt.plot(lasso00001.coef_,'v',label='Lasso alpha=0.0001')
plt.plot(ridge01.coef_,'o',label='Ridge alpha=0.1')
plt.legend(ncol=2,loc=(0,1.05))
plt.ylim(-25,25)
plt.xlabel('Coefficient index')
plt.ylabel('Coefficient magnitude')
print(lasso.coef_)
[-0.          0.         -0.          0.         -0.          0.
 -0.          0.         -0.         -0.         -0.          0.
 -5.3529079  -0.          0.         -0.          0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.         -0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.          0.          0.         -0.
 -0.         -0.          0.         -0.         -0.          0.
 -0.         -1.05063037 -3.3104274  -0.         -0.          0.
 -0.         -0.         -0.          0.         -0.         -0.41386744
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.          0.
 -0.         -0.        ]

[figure: coefficient magnitudes of lasso (alpha=1, 0.01, 0.0001) vs. ridge (alpha=0.1)]

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
X,y=mglearn.datasets.make_forge()
fig,axes=plt.subplots(1,2,figsize=(10,3))
for model,ax in zip([LinearSVC(),LogisticRegression()],axes):
    clf=model.fit(X,y)
    mglearn.plots.plot_2d_separator(clf,X,fill=False,eps=0.5,ax=ax,alpha=0.7)
    mglearn.discrete_scatter(X[:,0],X[:,1],y,ax=ax)
    ax.set_title('{}'.format(clf.__class__.__name__))
    ax.set_xlabel('Feature 0')
    ax.set_ylabel('Feature 1')
axes[0].legend()
(ConvergenceWarning: liblinear failed to converge; increase the number of iterations)





<matplotlib.legend.Legend at 0x2eacb6e8a90>

[figure: decision boundaries of LinearSVC and LogisticRegression on forge]

mglearn.plots.plot_linear_svc_regularization()

[figure: decision boundaries of a linear SVC for different values of C]

from sklearn.datasets import load_breast_cancer
cancer=load_breast_cancer()
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=42)
logreg=LogisticRegression().fit(X_train,y_train)
print('Training set score:{:.3f}'.format(logreg.score(X_train,y_train)))
print("Test set score:{:.3f}".format(logreg.score(X_test,y_test)))
Training set score:0.955
Test set score:0.958


(ConvergenceWarning: lbfgs failed to converge; increase max_iter or scale the data)
logreg100=LogisticRegression(C=100).fit(X_train,y_train)
print('Training set score:{:.3f}'.format(logreg100.score(X_train,y_train)))
print("Test set score:{:.3f}".format(logreg100.score(X_test,y_test)))
Training set score:0.944
Test set score:0.958


(ConvergenceWarning: lbfgs failed to converge; increase max_iter or scale the data)
logreg001=LogisticRegression(C=0.01).fit(X_train,y_train)
print('Training set score:{:.3f}'.format(logreg001.score(X_train,y_train)))
print("Test set score:{:.3f}".format(logreg001.score(X_test,y_test)))
Training set score:0.937
Test set score:0.930


(ConvergenceWarning: lbfgs failed to converge; increase max_iter or scale the data)
plt.plot(logreg.coef_.T,'o',label='C=1')
plt.plot(logreg100.coef_.T,'^',label='C=100')
plt.plot(logreg001.coef_.T,'v',label='C=0.01')
print(logreg.coef_.T)
plt.xticks(range(cancer.data.shape[1]),cancer.feature_names,rotation=90)
plt.hlines(0,0,cancer.data.shape[1])
plt.ylim(-5,5)
plt.xlabel('Coefficient index')
plt.ylabel('Coefficient magnitude')
plt.legend()
[[ 1.61353724]
 [ 0.05653993]
 [-0.0379765 ]
 [ 0.00296929]
 [-0.07199378]
 [-0.31364118]
 [-0.43808733]
 [-0.18276726]
 [-0.11456157]
 [-0.02433269]
 [ 0.06416621]
 [ 0.84241837]
 [ 0.35199832]
 [-0.10429922]
 [-0.00674319]
 [-0.06786874]
 [-0.09653963]
 [-0.0240688 ]
 [-0.02813081]
 [-0.00614864]
 [ 1.57958217]
 [-0.26155772]
 [-0.16723925]
 [-0.02556688]
 [-0.12867136]
 [-1.01367658]
 [-1.26659045]
 [-0.36289883]
 [-0.35240931]
 [-0.10159592]]





<matplotlib.legend.Legend at 0x2eacb685be0>

[figure: logistic regression coefficients on the cancer data for C=1, 100, 0.01]

for C,marker in zip([0.001,1,100],['o','^','v']):
    lr_l1=LogisticRegression(C=C,penalty='l1',solver='liblinear').fit(X_train,y_train)
    print('Training accuracy of l1 logreg with C={:.3f}:{:.2f}'.format(C,lr_l1.score(X_train,y_train)))
    print('Test accuracy of l1 logreg with C={:.3f}:{:.2f}'.format(C,lr_l1.score(X_test,y_test)))
    plt.plot(lr_l1.coef_.T,marker,label='C={:.3f}'.format(C))
plt.xticks(range(cancer.data.shape[1]),cancer.feature_names,rotation=90)
plt.hlines(0,0,cancer.data.shape[1])
plt.xlabel("Coefficient index")
plt.ylabel('Coefficient magnitude')
plt.ylim(-5,5)
plt.legend(loc=3)
Training accuracy of l1 logreg with C=0.001:0.91
Test accuracy of l1 logreg with C=0.001:0.92
Training accuracy of l1 logreg with C=1.000:0.96
Test accuracy of l1 logreg with C=1.000:0.96
Training accuracy of l1 logreg with C=100.000:0.99
Test accuracy of l1 logreg with C=100.000:0.98


(ConvergenceWarning: liblinear failed to converge; increase the number of iterations)





<matplotlib.legend.Legend at 0x2eacb8f71f0>

[figure: coefficients of L1-penalized logistic regression for different values of C]

from sklearn.datasets import make_blobs
X,y=make_blobs(random_state=42)
mglearn.discrete_scatter(X[:,0],X[:,1],y)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend(['Class 0','Class 1','Class 2'])
<matplotlib.legend.Legend at 0x2eacb861640>

[figure: scatter plot of the three-class blobs dataset]

from sklearn.svm import LinearSVC
linear_svm=LinearSVC().fit(X,y)
print('Coefficient shape:',linear_svm.coef_.shape)
print('Intercept shape:',linear_svm.intercept_.shape)
Coefficient shape: (3, 2)
Intercept shape: (3,)
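Under the one-vs-rest scheme each of the three rows of coef_ scores one class, and predict takes the class with the highest score; a quick verification sketch (my addition):
scores=X@linear_svm.coef_.T+linear_svm.intercept_  # (n_samples, 3) per-class scores
print('argmax matches predict:{}'.format(np.all(np.argmax(scores,axis=1)==linear_svm.predict(X))))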
mglearn.discrete_scatter(X[:,0],X[:,1],y)
line=np.linspace(-15,15)
for coef ,intercept,color in zip(linear_svm.coef_,linear_svm.intercept_,['b','r','g']):
    plt.plot(line,-(line*coef[0]+intercept)/coef[1],c=color)
plt.ylim(-10,15)
plt.xlim(-10,8)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend(['Class 0','Class 1','Class 2','Linear class 0','Linear class 1','Linear class 2'],loc=(1.01,0.3))
print(line)
print(-(line*coef[0]+intercept)/coef[1])
[-15.         -14.3877551  -13.7755102  -13.16326531 -12.55102041
 -11.93877551 -11.32653061 -10.71428571 -10.10204082  -9.48979592
  -8.87755102  -8.26530612  -7.65306122  -7.04081633  -6.42857143
  -5.81632653  -5.20408163  -4.59183673  -3.97959184  -3.36734694
  -2.75510204  -2.14285714  -1.53061224  -0.91836735  -0.30612245
   0.30612245   0.91836735   1.53061224   2.14285714   2.75510204
   3.36734694   3.97959184   4.59183673   5.20408163   5.81632653
   6.42857143   7.04081633   7.65306122   8.26530612   8.87755102
   9.48979592  10.10204082  10.71428571  11.32653061  11.93877551
  12.55102041  13.16326531  13.7755102   14.3877551   15.        ]
[ 13.48522515  12.91759141  12.34995767  11.78232394  11.2146902
  10.64705646  10.07942272   9.51178898   8.94415524   8.37652151
   7.80888777   7.24125403   6.67362029   6.10598655   5.53835281
   4.97071908   4.40308534   3.8354516    3.26781786   2.70018412
   2.13255038   1.56491665   0.99728291   0.42964917  -0.13798457
  -0.70561831  -1.27325205  -1.84088579  -2.40851952  -2.97615326
  -3.543787    -4.11142074  -4.67905448  -5.24668822  -5.81432195
  -6.38195569  -6.94958943  -7.51722317  -8.08485691  -8.65249065
  -9.22012438  -9.78775812 -10.35539186 -10.9230256  -11.49065934
 -12.05829308 -12.62592681 -13.19356055 -13.76119429 -14.32882803]

[figure: the three one-vs-rest class boundary lines on the blobs data]

mglearn.plots.plot_2d_classification(linear_svm,X,fill=True,alpha=0.7)
mglearn.discrete_scatter(X[:,0],X[:,1],y)
line=np.linspace(-15,15)
for coef ,intercept,color in zip(linear_svm.coef_,linear_svm.intercept_,['b','r','g']):
    plt.plot(line,-(line*coef[0]+intercept)/coef[1],c=color)
plt.legend(['Class 0','Class 1','Class 2','Linear class 0','Linear class 1','Linear class 2'],loc=(1.01,0.3))
plt.xlabel("Feature 0")
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

[figure: multiclass decision regions together with the one-vs-rest lines]

X=np.array([[0,1,0,1],
[1,0,1,1],[0,0,0,1],[1,0,1,0]])
y=np.array([0,1,0,1])
counts={}
for label in np.unique(y):
    counts[label]=X[y==label].sum(axis=0)
print('Feature counts:\n{}'.format(counts))
Feature counts:
{0: array([0, 1, 0, 2]), 1: array([2, 0, 2, 1])}
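These per-class nonzero counts are exactly what BernoulliNB learns; a minimal sketch (my addition) fitting sklearn's estimator on the same toy data:
from sklearn.naive_bayes import BernoulliNB
nb=BernoulliNB().fit(X,y)  # internally counts nonzero entries per feature and class (with smoothing)
print('predictions on the training data:{}'.format(nb.predict(X)))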
# mglearn.plots.plot_animal_tree()
from sklearn.tree import DecisionTreeClassifier
cancer=load_breast_cancer()
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,random_state=42,stratify=cancer.target)
tree=DecisionTreeClassifier(random_state=0)
tree.fit(X_train,y_train)
print('Accuracy on training set:{:.3f}'.format(tree.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(tree.score(X_test,y_test)))
Accuracy on training set:1.000
Accuracy on test set:0.937
tree=DecisionTreeClassifier(max_depth=4,random_state=0)
tree.fit(X_train,y_train)
print('Accuracy on training set:{:.3f}'.format(tree.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(tree.score(X_test,y_test)))
Accuracy on training set:0.988
Accuracy on test set:0.951
from sklearn.tree import export_graphviz
export_graphviz(tree,out_file='tree.dot',class_names=['malignant','benign'],feature_names=cancer.feature_names,impurity=False,filled=True)
# import graphviz
# with open(r'C:\Users\reion\Documents\Python机器学习基础教程\tree.dot') as f:
#     dot_graph=f.read()
# graphviz.Source(dot_graph)
print('Feature importance :\n{}'.format(tree.feature_importances_))
Feature importance :
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.01019737 0.04839825
 0.         0.         0.0024156  0.         0.         0.
 0.         0.         0.72682851 0.0458159  0.         0.
 0.0141577  0.         0.018188   0.1221132  0.01188548 0.        ]
def plot_feature_importances_cancer(model):
    n_feature=cancer.data.shape[1]
    plt.barh(range(n_feature),model.feature_importances_,align='center')
    plt.yticks(np.arange(n_feature),cancer.feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
plot_feature_importances_cancer(tree)

[figure: feature importances of the decision tree on the cancer data]

# tree=mglearn.plots.plot_tree_not_monotone()
# display(tree)
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
X,y=make_moons(n_samples=100,noise=0.25,random_state=3)
X_train,X_test,y_train,y_test=train_test_split(X,y,stratify=y,random_state=42)
forest=RandomForestClassifier(n_estimators=5,random_state=2)
forest.fit(X_train,y_train)
RandomForestClassifier(n_estimators=5, random_state=2)
fig,axes=plt.subplots(2,3,figsize=(20,10))
for i,(ax,tree) in enumerate(zip(axes.ravel(),forest.estimators_)):
    ax.set_title('Tree{}'.format(i))
    mglearn.plots.plot_tree_partition(X_train,y_train,tree,ax=ax)
mglearn.plots.plot_2d_separator(forest,X_train,fill=True,ax=axes[-1,-1],alpha=0.4)
axes[-1,-1].set_title('RandomForest')
mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train)
[<matplotlib.lines.Line2D at 0x2eac8a1d5e0>,
 <matplotlib.lines.Line2D at 0x2eac8a1d0d0>]

[figure: the five trees of the random forest and the aggregated decision boundary]

X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=0)
forest=RandomForestClassifier(n_estimators=100,random_state=0)
forest.fit(X_train,y_train)
print('Accuracy on training set:{:.3f}'.format(forest.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(forest.score(X_test,y_test)))
Accuracy on training set:0.998
Accuracy on test set:0.944
plot_feature_importances_cancer(forest)

[figure: feature importances of the random forest on the cancer data]

from sklearn.ensemble import GradientBoostingClassifier
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=0)
gbrt=GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train,y_train)
print('Accuracy on training set:{:.3f}'.format(gbrt.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(gbrt.score(X_test,y_test)))
Accuracy on training set:1.000
Accuracy on test set:0.958
gbrt=GradientBoostingClassifier(max_depth=1,random_state=0)
gbrt.fit(X_train,y_train)
print('Accuracy on training set:{:.3f}'.format(gbrt.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(gbrt.score(X_test,y_test)))
Accuracy on training set:0.995
Accuracy on test set:0.965
gbrt=GradientBoostingClassifier(random_state=0,learning_rate=0.01)
gbrt.fit(X_train,y_train)
print('Accuracy on training set:{:.3f}'.format(gbrt.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(gbrt.score(X_test,y_test)))
Accuracy on training set:0.995
Accuracy on test set:0.944
gbrt=GradientBoostingClassifier(max_depth=1,random_state=0)
gbrt.fit(X_train,y_train)
plot_feature_importances_cancer(gbrt)

[figure: feature importances of the gradient boosting model on the cancer data]

X,y=make_blobs(centers=4,random_state=8)
y=y%2
mglearn.discrete_scatter(X[:,0],X[:,1],y)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

[figure: two-class dataset built from four blob centers]

from sklearn.svm import LinearSVC
linear_svm=LinearSVC().fit(X,y)
mglearn.discrete_scatter(X[:,0],X[:,1],y)
mglearn.plots.plot_2d_separator(linear_svm,X,fill=False)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
(ConvergenceWarning: liblinear failed to converge; increase the number of iterations)





Text(0, 0.5, 'Feature 1')

[figure: decision boundary of a linear SVC on the nonlinear blobs data]

X_new=np.hstack([X,X[:,1:]**2])
from mpl_toolkits.mplot3d import Axes3D,axes3d 
figure=plt.figure()
ax=Axes3D(figure,elev=-152,azim=-26)
mask=y==0
ax.scatter(X_new[mask,0],X_new[mask,1],X_new[mask,2],c='b',cmap=mglearn.cm2,s=60)
ax.scatter(X_new[~mask,0],X_new[~mask,1],X_new[~mask,2],c='r',cmap=mglearn.cm2,s=60)
ax.set_xlabel('feature 0')
ax.set_ylabel('feature 1')
ax.set_zlabel('feature1**2')
Text(0.5, 0, 'feature1**2')

[figure: 3D scatter of the data after adding the feature1**2 dimension]

linear_svm_3d=LinearSVC().fit(X_new,y)
coef,intercept=linear_svm_3d.coef_.ravel(),linear_svm_3d.intercept_.ravel()
figure=plt.figure()
ax=Axes3D(figure,elev=-152,azim=-26)
xx=np.linspace(X_new[:,0].min()-2,X_new[:,0].max()+2,50)
yy=np.linspace(X_new[:,1].min()-2,X_new[:,1].max()+2,50)
XX,YY=np.meshgrid(xx,yy)
ZZ=(coef[0]*XX+coef[1]*YY+intercept)/-coef[2]
ax.plot_surface(XX,YY,ZZ,rstride=8,cstride=8,alpha=0.8)
ax.scatter(X_new[mask,0],X_new[mask,1],X_new[mask,2],c='b',cmap=mglearn.cm2,s=60)
ax.scatter(X_new[~mask,0],X_new[~mask,1],X_new[~mask,2],c='r',cmap=mglearn.cm2,s=60)
ax.set_xlabel('feature 0')
ax.set_ylabel('feature 1')
ax.set_zlabel('feature1**2')
(ConvergenceWarning: liblinear failed to converge; increase the number of iterations)





Text(0.5, 0, 'feature1**2')

[figure: linear decision boundary in the expanded 3D feature space]

ZZ=YY**2
dec=linear_svm_3d.decision_function(np.c_[XX.ravel(),YY.ravel(),ZZ.ravel()])
plt.contourf(XX,YY,dec.reshape(XX.shape),levels=[dec.min(),0,dec.max()],cmap=mglearn.cm2,alpha=0.5)
mglearn.discrete_scatter(X[:,0],X[:,1],y)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

[figure: the same boundary shown as an ellipse in the original two features]

from sklearn.svm import SVC
X,y=mglearn.tools.make_handcrafted_dataset()
svm=SVC(kernel='rbf',C=10,gamma=0.1).fit(X,y)
mglearn.plots.plot_2d_separator(svm,X,eps=0.5)
mglearn.discrete_scatter(X[:,0],X[:,1],y)
sv=svm.support_vectors_
sv_labels=svm.dual_coef_.ravel()>0
print(sv)
mglearn.discrete_scatter(sv[:,0],sv[:,1],sv_labels,s=10,markeredgewidth=3)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
[[ 8.1062269   4.28695977]
 [ 9.50169345  1.93824624]
 [11.563957    1.3389402 ]
 [10.24028948  2.45544401]
 [ 7.99815287  4.8525051 ]]





Text(0, 0.5, 'Feature 1')

[figure: RBF-kernel SVM decision boundary with the support vectors highlighted]

fig,axes=plt.subplots(3,3,figsize=(15,10))
for ax,C in zip(axes,[-1,0,3]):
    for a ,gamma in zip(ax,range(-1,2)):
        mglearn.plots.plot_svm(log_C=C,log_gamma=gamma,ax=a)
axes[0,0].legend(['Class 0','Class 1','sv class 0','sv class 1'],ncol=4,loc=(0.9,1.2))
<matplotlib.legend.Legend at 0x2eacbc03640>

[figure: SVM decision boundaries for varying C and gamma]

X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=0)
svc=SVC()
svc.fit(X_train,y_train)
print('Accuracy on training set:{:.3f}'.format(svc.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(svc.score(X_test,y_test)))
Accuracy on training set:0.923
Accuracy on test set:0.916
plt.plot(X_train.min(axis=0),'o',label='min')
plt.plot(X_train.max(axis=0),'^',label='max')
plt.legend(loc=4)
plt.xlabel('Feature index')
plt.ylabel('Feature magnitude')
plt.yscale('log')

[figure: per-feature minima and maxima of the cancer training data, log scale]

min_on_training=X_train.min(axis=0)
max_on_training=X_train.max(axis=0)
range_on_training=(X_train-min_on_training).max(axis=0)
X_train_scaled=(X_train-min_on_training)/range_on_training
print('Minimum for each feature\n{}'.format(X_train_scaled.min(axis=0)))
print('Maximum for each feature\n{}'.format(X_train_scaled.max(axis=0)))
Minimum for each feature
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
Maximum for each feature
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
X_test_scaled=(X_test-min_on_training)/range_on_training
svc=SVC()
svc.fit(X_train_scaled,y_train)
print('Accuracy on training set:{:.3f}'.format(svc.score(X_train_scaled,y_train)))
print('Accuracy on test set:{:.3f}'.format(svc.score(X_test_scaled,y_test)))
Accuracy on training set:0.991
Accuracy on test set:0.944
svc=SVC(C=1000)
svc.fit(X_train_scaled,y_train)
print('Accuracy on training set:{:.3f}'.format(svc.score(X_train_scaled,y_train)))
print('Accuracy on test set:{:.3f}'.format(svc.score(X_test_scaled,y_test)))
Accuracy on training set:1.000
Accuracy on test set:0.951
# display(mglearn.plots.plot_logistic_regression_graph())
display(mglearn.plots.plot_single_hidden_layer_graph())
(ExecutableNotFound: the Graphviz 'dot' executable is not on the system PATH, so the diagram could not be rendered; the call still returned a graphviz.dot.Digraph object. Installing Graphviz and adding it to PATH fixes this.)
line=np.linspace(-3,3,100)
plt.plot(line,np.tanh(line),label='tanh')
plt.plot(line,np.maximum(line,0),label='relu')
plt.legend(loc='best')
plt.xlabel('x')
plt.ylabel('relu(x),tanh(x)')
print(len(np.maximum(line,0)))
100

[figure: the tanh and relu activation functions]

mglearn.plots.plot_two_hidden_layer_graph()
(same ExecutableNotFound as above: Graphviz 'dot' not on PATH, diagram not rendered; a graphviz.dot.Digraph object was still returned)
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
X,y=make_moons(n_samples=100,noise=0.25,random_state=3)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,stratify=y)
mlp=MLPClassifier(solver='lbfgs',random_state=0).fit(X_train,y_train)
mglearn.plots.plot_2d_separator(mlp,X_train,fill=True,alpha=0.3)
mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

[figure: MLP decision boundary on two_moons with the default hidden layer]

mlp=MLPClassifier(solver='lbfgs',random_state=0,hidden_layer_sizes=[10]).fit(X_train,y_train)
mglearn.plots.plot_2d_separator(mlp,X_train,fill=True,alpha=0.3)
mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
(ConvergenceWarning: lbfgs failed to converge; increase max_iter or scale the data)





Text(0, 0.5, 'Feature 1')

[figure: MLP decision boundary with a single 10-unit hidden layer]

mlp=MLPClassifier(solver='lbfgs',random_state=0,hidden_layer_sizes=[10,10]).fit(X_train,y_train)
mglearn.plots.plot_2d_separator(mlp,X_train,fill=True,alpha=0.3)
mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

[figure: MLP decision boundary with two 10-unit hidden layers]

mlp=MLPClassifier(solver='lbfgs',random_state=0,hidden_layer_sizes=[10,10],activation='relu').fit(X_train,y_train)
mglearn.plots.plot_2d_separator(mlp,X_train,fill=True,alpha=0.3)
mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

[figure: MLP decision boundary with two 10-unit hidden layers, relu activation]

mlp=MLPClassifier(solver='lbfgs',random_state=0,hidden_layer_sizes=[10,10],activation='tanh').fit(X_train,y_train)
mglearn.plots.plot_2d_separator(mlp,X_train,fill=True,alpha=0.3)
mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
(ConvergenceWarning: lbfgs failed to converge; increase max_iter or scale the data)





Text(0, 0.5, 'Feature 1')

[figure: MLP decision boundary with two 10-unit hidden layers, tanh activation]

fig,axes=plt.subplots(2,4,figsize=(20,8))
for axx,n_hidden_nodes in zip(axes,[10,100]):
    for ax ,alpha in zip(axx,[0.0001,0.01,0.1,1]):
        mlp=MLPClassifier(solver='lbfgs',random_state=0,hidden_layer_sizes=[n_hidden_nodes,n_hidden_nodes],alpha=alpha)
        mlp.fit(X_train,y_train)
        mglearn.plots.plot_2d_separator(mlp,X_train,fill=True,alpha=0.3,ax=ax)
        mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train,ax=ax)
        ax.set_title('hidden_layer_sizes=[{},{}]\nalpha={:.4f}'.format(n_hidden_nodes,n_hidden_nodes,alpha))
(ConvergenceWarning, repeated: lbfgs failed to converge for some settings; increase max_iter or scale the data)

[figure: MLP decision boundaries for different hidden sizes and alpha values]

fig,axes=plt.subplots(2,4,figsize=(20,8))
for i,ax in enumerate(axes.ravel()):
    mlp=MLPClassifier(solver='lbfgs',random_state=i,hidden_layer_sizes=[100,100])
    mlp.fit(X_train,y_train)
    mglearn.plots.plot_2d_separator(mlp,X_train,fill=True,ax=ax,alpha=0.3)
    mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train,ax=ax)
(ConvergenceWarning: lbfgs failed to converge; increase max_iter or scale the data)

[figure: MLP decision boundaries for eight different random initializations]

print('Cancer data per-feature maxima:\n{}'.format(cancer.data.max(axis=0)))
Cancer data per-feature maxima:
[2.811e+01 3.928e+01 1.885e+02 2.501e+03 1.634e-01 3.454e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 3.113e-02 1.354e-01 3.960e-01 5.279e-02 7.895e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.910e-01
 6.638e-01 2.075e-01]
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=0)
mlp=MLPClassifier(random_state=42)
mlp.fit(X_train,y_train)
print('Accuracy on training set:{:.3f}'.format(mlp.score(X_train,y_train)))
print('Accuracy on test set:{:.3f}'.format(mlp.score(X_test,y_test)))
Accuracy on training set:0.939
Accuracy on test set:0.923
mean_on_train=X_train.mean(axis=0)
std_on_train=X_train.std(axis=0)
X_train_scaled=(X_train-mean_on_train)/std_on_train
X_test_scaled=(X_test-mean_on_train)/std_on_train
mlp=MLPClassifier(random_state=0)
mlp.fit(X_train_scaled,y_train)
print('Accuracy on training set:{:.3f}'.format(mlp.score(X_train_scaled,y_train)))
print('Accuracy on test set:{:.3f}'.format(mlp.score(X_test_scaled,y_test)))
Accuracy on training set:0.998
Accuracy on test set:0.951


(ConvergenceWarning: the stochastic optimizer reached max_iter=200 without converging)
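The by-hand z-scoring above is exactly what StandardScaler automates; an equivalence sketch (my addition):
from sklearn.preprocessing import StandardScaler
sc=StandardScaler().fit(X_train)  # learns per-feature mean and std on the training set
print('same as manual scaling:{}'.format(np.allclose(sc.transform(X_train),X_train_scaled)))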
mlp=MLPClassifier(random_state=0,max_iter=1000)
mlp.fit(X_train_scaled,y_train)
print('Accuracy on training set:{:.3f}'.format(mlp.score(X_train_scaled,y_train)))
print('Accuracy on test set:{:.3f}'.format(mlp.score(X_test_scaled,y_test)))
Accuracy on training set:0.998
Accuracy on test set:0.958
mlp=MLPClassifier(random_state=0,alpha=1,max_iter=1000)
mlp.fit(X_train_scaled,y_train)
print('Accuracy on training set:{:.3f}'.format(mlp.score(X_train_scaled,y_train)))
print('Accuracy on test set:{:.3f}'.format(mlp.score(X_test_scaled,y_test)))
Accuracy on training set:0.991
Accuracy on test set:0.965
plt.figure(figsize=(20,5))
plt.imshow(mlp.coefs_[0],interpolation='none',cmap='viridis')
plt.yticks(range(30),cancer.feature_names)
plt.xlabel('Columns in weight matrix')
plt.ylabel('Input feature')
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x2ea8245ce20>

[figure: heat map of the MLP's first-layer weights on the cancer data]

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_circles
X,y=make_circles(noise=0.25,factor=0.5,random_state=1)
y_named=np.array(['blue','red'])[y]
print(y_named)
X_train,X_test,y_train_named,y_test_named,y_train,y_test=train_test_split(X,y_named,y,random_state=0)
gbrt=GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train,y_train_named)
['red' 'red' 'blue' 'red' 'red' 'blue' 'blue' 'red' 'red' 'red' 'red'
 'blue' 'red' 'red' 'red' 'blue' 'blue' 'blue' 'red' 'blue' 'blue' 'red'
 'red' 'red' 'blue' 'blue' 'red' 'blue' 'blue' 'blue' 'red' 'red' 'red'
 'red' 'red' 'blue' 'blue' 'red' 'blue' 'blue' 'red' 'red' 'red' 'blue'
 'red' 'blue' 'blue' 'red' 'blue' 'red' 'blue' 'red' 'blue' 'blue' 'red'
 'blue' 'blue' 'red' 'blue' 'red' 'blue' 'red' 'red' 'blue' 'blue' 'red'
 'blue' 'red' 'blue' 'red' 'red' 'blue' 'red' 'red' 'blue' 'red' 'blue'
 'red' 'red' 'blue' 'blue' 'blue' 'blue' 'blue' 'blue' 'red' 'blue' 'blue'
 'red' 'red' 'blue' 'blue' 'red' 'red' 'blue' 'red' 'blue' 'red' 'blue'
 'blue']





GradientBoostingClassifier(random_state=0)
print('X_train.shape:{}'.format(X_train.shape))
print('Decision function shape:{}'.format(gbrt.decision_function(X_test).shape))
X_train.shape:(75, 2)
Decision function shape:(25,)
print('Decision function:\n{}'.format(gbrt.decision_function(X_test)[:6]))
Decision function:
[ 4.13592603 -1.70169917 -3.95106099 -3.62609552  4.28986642  3.66166081]
print('Thresholded decision function:\n{}'.format(gbrt.decision_function(X_test)>0))
print('Predictions:\n{}'.format(gbrt.predict(X_test)))
Thresholded decision function:
[ True False False False  True  True False  True  True  True False  True
  True False  True False False False  True  True  True  True  True False
 False]
Predictions:
['red' 'blue' 'blue' 'blue' 'red' 'red' 'blue' 'red' 'red' 'red' 'blue'
 'red' 'red' 'blue' 'red' 'blue' 'blue' 'blue' 'red' 'red' 'red' 'red'
 'red' 'blue' 'blue']
greater_zero=(gbrt.decision_function(X_test)>0).astype(int)
pred=gbrt.classes_[greater_zero]
print('pred is equal to predictions :{}'.format(np.all(pred==gbrt.predict(X_test))))
pred is equal to predictions :True
decision_function=gbrt.decision_function(X_test)
print('Decision function minimum:{:.2f},maximum:{:.2f}'.format(np.min(decision_function),np.max(decision_function)))
Decision function minimum:-7.69,maximum:4.29
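For this binary gradient boosting model, predict_proba should, to my understanding of the deviance loss, be the logistic sigmoid of the decision function; a hedged check:
from scipy.special import expit  # logistic sigmoid
proba_from_df=expit(gbrt.decision_function(X_test))
print('matches predict_proba:{}'.format(np.allclose(proba_from_df,gbrt.predict_proba(X_test)[:,1])))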
fig,axes=plt.subplots(1,2,figsize=(13,5))
mglearn.tools.plot_2d_separator(gbrt,X,ax=axes[0],alpha=0.4,fill=True,cm=mglearn.cm2)
score_image=mglearn.tools.plot_2d_scores(gbrt,X,ax=axes[1],alpha=0.4,cm=mglearn.ReBl)
for ax in axes:
    mglearn.discrete_scatter(X_test[:,0],X_test[:,1],y_test,markers='^',ax=ax)
    mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train,markers='o',ax=ax)
    ax.set_xlabel('feature 0')
    ax.set_ylabel('feature 1')
cbar=plt.colorbar(score_image,ax=axes.tolist())
axes[0].legend(['Test class 0','Test Class 1','Train class 0','Train class 1'],ncol=4,loc=(0.1,1.1))
<matplotlib.legend.Legend at 0x2eacd37c430>

[figure: decision boundary (left) and decision function values (right) of the gradient boosting model]

from sklearn.datasets import load_iris
iris=load_iris()
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,random_state=42)
gbrt=GradientBoostingClassifier(learning_rate=0.01,random_state=0)
gbrt.fit(X_train,y_train)
GradientBoostingClassifier(learning_rate=0.01, random_state=0)
print('Decision function shape:{}'.format(gbrt.decision_function(X_test).shape))
print('Decision function:\n{}'.format(gbrt.decision_function(X_test)))
Decision function shape:(38, 3)
Decision function:
[[-1.995715    0.04758267 -1.92720695]
 [ 0.06146394 -1.90755736 -1.92793758]
 [-1.99058203 -1.87637861  0.09686725]
 [-1.995715    0.04758267 -1.92720695]
 [-1.99730159 -0.13469108 -1.20341483]
 [ 0.06146394 -1.90755736 -1.92793758]
 [-1.995715    0.04758267 -1.92720695]
 [-1.99677434 -1.87637861  0.09686725]
 [-1.995715    0.04758267 -1.92720695]
 [-1.995715    0.04758267 -1.92720695]
 [-1.99677434 -1.87637861  0.07863156]
 [ 0.06146394 -1.90755736 -1.92793758]
 [ 0.06146394 -1.90755736 -1.92793758]
 [ 0.06146394 -1.90755736 -1.92793758]
 [ 0.06146394 -1.90755736 -1.92793758]
 [-1.995715    0.04758267 -1.92720695]
 [-1.99087515 -1.87637861  0.09686725]
 [-1.995715    0.04758267 -1.92720695]
 [-1.995715    0.04758267 -1.92720695]
 [-1.99087515 -1.87637861  0.09686725]
 [ 0.06146394 -1.90755736 -1.92793758]
 [-1.99730159 -1.86429671  0.04166049]
 [ 0.06146394 -1.90755736 -1.92793758]
 [-1.99087515 -1.87637861  0.09686725]
 [-1.99087515 -1.87637861  0.07863156]
 [-1.99087515 -1.87637861  0.09686725]
 [-1.99058203 -1.87637861  0.09686725]
 [-1.99087515 -1.87637861  0.07863156]
 [ 0.06146394 -1.90755736 -1.92793758]
 [ 0.06146394 -1.90755736 -1.92793758]
 [ 0.06146394 -1.90755736 -1.92793758]
 [ 0.06146394 -1.90755736 -1.92793758]
 [-1.995715    0.04758267 -1.92720695]
 [ 0.06146394 -1.90755736 -1.92793758]
 [ 0.06146394 -1.90755736 -1.92793758]
 [-1.99712219 -1.87637861  0.04166049]
 [-1.995715    0.04758267 -1.92720695]
 [ 0.06146394 -1.90755736 -1.92793758]]
print('Argmax of decision function:\n{}'.format(np.argmax(gbrt.decision_function(X_test),axis=1)))
print('Predictions:\n{}'.format(gbrt.predict(X_test)))
Argmax of decision function:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
Predictions:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
print('Predicted probabilities:\n{}'.format(gbrt.predict_proba(X_test)[:6]))
print('Sums:{}'.format(gbrt.predict_proba(X_test)[:6].sum(axis=1)))
Predicted probabilities:
[[0.10217718 0.78840034 0.10942248]
 [0.78347147 0.10936745 0.10716108]
 [0.09818072 0.11005864 0.79176065]
 [0.10217718 0.78840034 0.10942248]
 [0.10360005 0.66723901 0.22916094]
 [0.78347147 0.10936745 0.10716108]]
Sums:[1. 1. 1. 1. 1. 1.]
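In the multiclass case the link should analogously be a softmax over the per-class decision values (my assumption about the multinomial deviance loss; a hedged check):
from scipy.special import softmax
print('softmax matches predict_proba:{}'.format(np.allclose(softmax(gbrt.decision_function(X_test),axis=1),gbrt.predict_proba(X_test))))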
print("Argmax of predicted probabilities:\n{}".format(np.argmax(gbrt.predict_proba(X_test),axis=1)))
print('Predictions:\n{}'.format(gbrt.predict(X_test)))
Argmax of predicted probabilities:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
Predictions:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
logreg=LogisticRegression()
named_target=iris.target_names[y_train]
logreg.fit(X_train,named_target)
print('Unique classes in training data:{}'.format(logreg.classes_))
print('Predictions:{}'.format(logreg.predict(X_test)[:10]))
argmax_dec_func=np.argmax(logreg.decision_function(X_test)[:10],axis=1)
print('argmax of decision function:{}'.format(argmax_dec_func[:10]))
print('argmax combined with classes_:{}'.format(logreg.classes_[argmax_dec_func][:10]))
Unique classes in training data:['setosa' 'versicolor' 'virginica']
Predictions:['versicolor' 'setosa' 'virginica' 'versicolor' 'versicolor' 'setosa'
 'versicolor' 'virginica' 'versicolor' 'versicolor']
argmax of decision function:[1 0 2 1 1 0 1 2 1 1]
argmax combined with classes_:['versicolor' 'setosa' 'virginica' 'versicolor' 'versicolor' 'setosa'
 'versicolor' 'virginica' 'versicolor' 'versicolor']


(ConvergenceWarning: lbfgs failed to converge; increase max_iter or scale the data)
mglearn.plots.plot_scaling()

[figure: different ways of rescaling and preprocessing data]

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer=load_breast_cancer()
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=1)
print(X_train.shape)
print(X_test.shape)
(426, 30)
(143, 30)
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X_train)
MinMaxScaler()
X_train_scaled=scaler.transform(X_train)
print('Transformed shape:{}'.format(X_train_scaled.shape))
print('per-feature minimum before scaling:\n{}'.format(X_train.min(axis=0)))
print('per-feature maximum before scaling:\n{}'.format(X_train.max(axis=0)))
print('per-feature minimum after scaling:\n{}'.format(X_train_scaled.min(axis=0)))
print('per-feature maximum after scaling:\n{}'.format(X_train_scaled.max(axis=0)))
Transformed shape:(426, 30)
per-feature minimum before scaling:
[6.981e+00 1.038e+01 4.379e+01 1.435e+02 5.263e-02 2.650e-02 0.000e+00
 0.000e+00 1.167e-01 5.025e-02 1.144e-01 3.602e-01 7.570e-01 6.802e+00
 2.667e-03 3.746e-03 0.000e+00 0.000e+00 7.882e-03 9.502e-04 7.930e+00
 1.249e+01 5.041e+01 1.852e+02 8.409e-02 4.327e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]
per-feature maximum before scaling:
[2.811e+01 3.928e+01 1.885e+02 2.501e+03 1.634e-01 3.454e-01 4.264e-01
 1.913e-01 2.906e-01 9.575e-02 2.873e+00 3.647e+00 2.198e+01 5.422e+02
 3.113e-02 1.354e-01 3.960e-01 5.279e-02 7.895e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.910e-01
 5.774e-01 2.075e-01]
per-feature minimum after scaling:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
per-feature maximum after scaling:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
X_test_scaled=scaler.transform(X_test)
print('per-feature minimum after scaling:\n{}'.format(X_test_scaled.min(axis=0)))
print('per-feature maximum after scaling:\n{}'.format(X_test_scaled.max(axis=0)))
per-feature minimum after scaling:
[ 0.07648256 -0.02318339  0.07117684  0.03295864  0.08919383 -0.02232675
  0.          0.         -0.06152961 -0.00637363 -0.00105126  0.00079104
  0.00067851  0.00079567 -0.0335172  -0.01134793  0.          0.
  0.0233157  -0.00191763  0.03635717 -0.01268556  0.03107724  0.01349292
 -0.09327846 -0.01574803  0.          0.          0.00023759  0.01252788]
per-feature maximum after scaling:
[0.8173127  0.76435986 0.84589869 0.68610817 0.83118173 0.89338351
 1.00093809 1.05175118 1.07705578 1.03714286 0.50554629 1.37665815
 0.44117231 0.4224857  0.72596002 0.77972564 0.38762626 0.66054177
 0.75389768 0.75839224 0.80896478 0.88852901 0.75696001 0.66869839
 0.9075879  0.81108275 0.61717252 0.88487973 1.20527441 0.77371114]
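MinMaxScaler transforms each feature as (x - train_min)/(train_max - train_min), which is why test values can fall outside [0,1]; recomputing by hand (my sketch):
manual=(X_test-X_train.min(axis=0))/(X_train.max(axis=0)-X_train.min(axis=0))
print('matches scaler.transform:{}'.format(np.allclose(manual,X_test_scaled)))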
from sklearn.datasets import make_blobs
X,_=make_blobs(n_samples=50,centers=5,random_state=4)
X_train,X_test=train_test_split(X,random_state=5,test_size=0.1)  # split was missing; without it the plots below reuse the cancer arrays
fig,axes=plt.subplots(1,3,figsize=(13,4))
axes[0].scatter(X_train[:,0],X_train[:,1],c=mglearn.cm2(0),label='Training set',s=60)
axes[0].scatter(X_test[:,0],X_test[:,1],c=mglearn.cm2(1),label='Test set',s=60,marker='^')
axes[0].legend(loc='upper left')
axes[0].set_title('Original Data')
scaler=MinMaxScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)
axes[1].scatter(X_train_scaled[:,0],X_train_scaled[:,1],c=mglearn.cm2(0),label='Training set',s=60)
axes[1].scatter(X_test_scaled[:,0],X_test_scaled[:,1],c=mglearn.cm2(1),label='Test set',s=60,marker='^')
axes[1].set_title('Scaled Data')
# bad example: fitting a separate scaler on the test set
test_scaler=MinMaxScaler()
test_scaler.fit(X_test)
X_test_scaled_badly=test_scaler.transform(X_test)
axes[2].scatter(X_train_scaled[:,0],X_train_scaled[:,1],c=mglearn.cm2(0),label='Training set',s=60)
axes[2].scatter(X_test_scaled_badly[:,0],X_test_scaled_badly[:,1],c=mglearn.cm2(1),label='Test set',s=60,marker='^')
axes[2].set_title('Improperly Scaled Data')
for ax in axes:
    ax.set_xlabel('feature 0')
    ax.set_ylabel('feature 1')
(matplotlib warning, repeated once per scatter call: a single RGBA value passed via c should be passed via the color keyword instead)

png
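The right-hand panel shows why fitting a separate scaler on the test set is wrong: it changes the test points' positions relative to the training data. The safe pattern is the fit_transform shorthand on the training set only, followed by a plain transform on the test set; a brief sketch with the same arrays as above:

scaler=MinMaxScaler()
X_train_scaled=scaler.fit_transform(X_train)   # fit and transform in one step, on training data only
X_test_scaled=scaler.transform(X_test)         # reuse the training statistics for the test set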

from sklearn.svm import SVC
svm=SVC(C=100)
svm.fit(X_train,y_train)
print('Test set accuracy:{:.2f}'.format(svm.score(X_test,y_test)))
Test set accuracy:0.93
scaler=MinMaxScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)
svm.fit(X_train_scaled,y_train)
print('Test set accuracy:{:.2f}'.format(svm.score(X_test_scaled,y_test)))
Test set accuracy:0.97
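Because forgetting one of these transform calls is easy, the scaler and the model can be chained with a Pipeline so that fit and score apply both steps in order; a minimal sketch (same X_train/X_test as above):

from sklearn.pipeline import make_pipeline
pipe=make_pipeline(MinMaxScaler(),SVC(C=100))
pipe.fit(X_train,y_train)    # scales X_train, then fits the SVM on the scaled data
print('Pipeline test accuracy:{:.2f}'.format(pipe.score(X_test,y_test)))   # should match the two-step result above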
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)
print(X_train_scaled)
svm.fit(X_train_scaled,y_train)
print('SVM test accuracy:{:.2f}'.format(svm.score(X_test_scaled,y_test)))
[[-1.32273784 -0.24339118 -1.30979447 ... -0.98162332 -0.74451786
  -0.17937263]
 [-0.57400445 -1.09616357 -0.55701267 ... -0.75814958 -0.27864647
  -0.09138928]
 [-0.33626585 -0.26650154 -0.29217561 ... -0.0990813  -0.54099725
   0.22428432]
 ...
 [ 0.99850751 -0.01228755  0.97416698 ...  1.16019922  0.00914439
  -0.32281216]
 [-0.38782362 -1.6646785  -0.43967457 ... -0.50778729 -0.23412633
   0.52876005]
 [ 1.4711204  -0.79341782  1.43690881 ...  0.91820225 -0.76200792
  -1.18451579]]
SVM test accuracy:0.97
mglearn.plots.plot_pca_illustration()

png

fig,axes=plt.subplots(15,2,figsize=(10,20))
malignant=cancer.data[cancer.target==0]
benign=cancer.data[cancer.target==1]
ax=axes.ravel()
for i in range(30):
    _,bins=np.histogram(cancer.data[:,i],bins=50)
    ax[i].hist(malignant[:,i],bins=bins,color=mglearn.cm3(0),alpha=0.5)
    ax[i].hist(benign[:,i],bins=bins,color=mglearn.cm3(1),alpha=0.5)
    ax[i].set_title(cancer.feature_names[i])
    ax[i].set_yticks(())
ax[0].set_xlabel('Feature magnitude')
ax[0].set_ylabel('Frequency')
ax[0].legend(['malignant','benign'],loc='best')
fig.tight_layout()

png

scaler=StandardScaler()
scaler.fit(cancer.data)
X_scaled=scaler.transform(cancer.data)
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
pca.fit(X_scaled)
X_pca=pca.transform(X_scaled)
print('Original shape:{}'.format(str(X_scaled.shape)))
print('Reduced shape:{}'.format(str(X_pca.shape)))
Original shape:(569, 30)
Reduced shape:(569, 2)
plt.figure(figsize=(8,8))
mglearn.discrete_scatter(X_pca[:,0],X_pca[:,1],cancer.target)
plt.legend(cancer.target_names,loc='best')
plt.gca().set_aspect('equal')
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
Text(0, 0.5, 'Second principal component')

png

print('PCA component shape:{}'.format(pca.components_.shape))
PCA component shape:(2, 30)
print('PCA components:\n{}'.format(pca.components_))
PCA components:
[[ 0.21890244  0.10372458  0.22753729  0.22099499  0.14258969  0.23928535
   0.25840048  0.26085376  0.13816696  0.06436335  0.20597878  0.01742803
   0.21132592  0.20286964  0.01453145  0.17039345  0.15358979  0.1834174
   0.04249842  0.10256832  0.22799663  0.10446933  0.23663968  0.22487053
   0.12795256  0.21009588  0.22876753  0.25088597  0.12290456  0.13178394]
 [-0.23385713 -0.05970609 -0.21518136 -0.23107671  0.18611302  0.15189161
   0.06016536 -0.0347675   0.19034877  0.36657547 -0.10555215  0.08997968
  -0.08945723 -0.15229263  0.20443045  0.2327159   0.19720728  0.13032156
   0.183848    0.28009203 -0.21986638 -0.0454673  -0.19987843 -0.21935186
   0.17230435  0.14359317  0.09796411 -0.00825724  0.14188335  0.27533947]]
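How much variance these two components capture can be read off the fitted PCA object via its explained_variance_ratio_ attribute; a quick check (values not recorded here):

print('Variance explained per component:{}'.format(pca.explained_variance_ratio_))
print('Total variance explained:{:.2f}'.format(pca.explained_variance_ratio_.sum()))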
plt.matshow(pca.components_,cmap='viridis')
plt.yticks([0,1],['First component','Second component'])
plt.colorbar()
plt.xticks(range(len(cancer.feature_names)),cancer.feature_names,rotation=60,ha='left')
plt.xlabel('Feature')
plt.ylabel('Principal components')
Text(0, 0.5, 'Principal components')

png

from sklearn.datasets import fetch_lfw_people
people=fetch_lfw_people(min_faces_per_person=20,resize=0.7,download_if_missing=True)
image_shape=people.images[0].shape
print(image_shape)
fig,axes=plt.subplots(2,5,figsize=(15,8),subplot_kw={'xticks':(),'yticks':()})
for target ,image,ax in zip(people.target,people.images,axes.ravel()):
    ax.imshow(image)
    ax.set_title(people.target_names[target])
(87, 65)

png

print('People.images.shape:{}'.format(people.images.shape))
print('Number of classes:{}'.format(len(people.target_names)))
People.images.shape:(3023, 87, 65)
Number of classes:62
counts=np.bincount(people.target)
for i,(count,name)in enumerate(zip(counts,people.target_names)):
    print('{0:25}{1:3}'.format(name,count),end='  ')
    if (i+1)%3==0:
        print()
Alejandro Toledo          39  Alvaro Uribe              35  Amelie Mauresmo           21  
Andre Agassi              36  Angelina Jolie            20  Ariel Sharon              77  
Arnold Schwarzenegger     42  Atal Bihari Vajpayee      24  Bill Clinton              29  
Carlos Menem              21  Colin Powell             236  David Beckham             31  
Donald Rumsfeld          121  George Robertson          22  George W Bush            530  
Gerhard Schroeder        109  Gloria Macapagal Arroyo   44  Gray Davis                26  
Guillermo Coria           30  Hamid Karzai              22  Hans Blix                 39  
Hugo Chavez               71  Igor Ivanov               20  Jack Straw                28  
Jacques Chirac            52  Jean Chretien             55  Jennifer Aniston          21  
Jennifer Capriati         42  Jennifer Lopez            21  Jeremy Greenstock         24  
Jiang Zemin               20  John Ashcroft             53  John Negroponte           31  
Jose Maria Aznar          23  Juan Carlos Ferrero       28  Junichiro Koizumi         60  
Kofi Annan                32  Laura Bush                41  Lindsay Davenport         22  
Lleyton Hewitt            41  Luiz Inacio Lula da Silva 48  Mahmoud Abbas             29  
Megawati Sukarnoputri     33  Michael Bloomberg         20  Naomi Watts               22  
Nestor Kirchner           37  Paul Bremer               20  Pete Sampras              22  
Recep Tayyip Erdogan      30  Ricardo Lagos             27  Roh Moo-hyun              32  
Rudolph Giuliani          26  Saddam Hussein            23  Serena Williams           52  
Silvio Berlusconi         33  Tiger Woods               23  Tom Daschle               25  
Tom Ridge                 33  Tony Blair               144  Vicente Fox               32  
Vladimir Putin            49  Winona Ryder              24  
mask=np.zeros(people.target.shape,dtype=bool)
for target in np.unique(people.target):
    mask[np.where(people.target==target)[0][:50]]=1
X_people=people.data[mask]
y_people=people.target[mask]
X_people=X_people/255.
from sklearn.neighbors import KNeighborsClassifier
X_train,X_test,y_train,y_test=train_test_split(X_people,y_people,stratify=y_people,random_state=0)
knn=KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train,y_train)
print('Test set score of 1-nn:{:.2f}'.format(knn.score(X_test,y_test)))
Test set score of 1-nn:0.20
mglearn.plots.plot_pca_whitening()

png

pca=PCA(n_components=100,whiten=True,random_state=0).fit(X_train)
X_train_pca=pca.transform(X_train)
X_test_pca=pca.transform(X_test)
print('X_train_pca.shape:{}'.format(X_train_pca.shape))
X_train_pca.shape:(1547, 100)
knn=KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_pca,y_train)
print("Test set accuracy :{:.2f}".format(knn.score(X_test_pca,y_test)))
Test set accuracy :0.24
print('pca.components_.shape:{}'.format(pca.components_.shape))
pca.components_.shape:(100, 5655)
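Each of the 5655 pixels of a face is approximated from its 100 component coefficients; inverse_transform maps the reduced representation back to pixel space (it also undoes the whitening), which is what the reconstruction plot below relies on. A short sketch:

X_test_back=pca.inverse_transform(X_test_pca)     # shape (n_test_samples, 5655)
plt.imshow(X_test_back[0].reshape(image_shape))   # approximate reconstruction of the first test face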
fig,axes=plt.subplots(3,5,figsize=(15,12),subplot_kw={'xticks':(),'yticks':()})
for i ,(component,ax) in enumerate(zip(pca.components_,axes.ravel())):
    ax.imshow(component.reshape(image_shape),cmap='viridis')
    ax.set_title("{} component".format((i+1)))

png

mglearn.plots.plot_pca_faces(X_train,X_test,image_shape)

png

mglearn.discrete_scatter(X_train_pca[:,0],X_train_pca[:,1],y_train)
plt.xlabel("First principal component")
plt.ylabel('Second principal component')
Text(0, 0.5, 'Second principal component')

png

mglearn.plots.plot_nmf_illustration()
C:\Users\reion\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:312: FutureWarning: The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).
  warnings.warn(("The 'init' value, when 'init=None' and "
C:\Users\reion\anaconda3\lib\site-packages\sklearn\decomposition\_nmf.py:1090: ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
  warnings.warn("Maximum number of iterations %d reached. Increase it to"

png

from sklearn.decomposition import NMF
nmf=NMF(n_components=15,random_state=0)
nmf.fit(X_train)
X_train_nmf=nmf.transform(X_train)
X_test_nmf=nmf.transform(X_test)
fig,axes=plt.subplots(3,5,figsize=(15,12))
for i,(component,ax) in enumerate(zip(nmf.components_,axes.ravel())):
    ax.imshow(component.reshape(image_shape))
    ax.set_title('{}. component'.format(i))

png

compn=3
inds=np.argsort(X_train_nmf[:,compn])[::-1]
fig,axes=plt.subplots(2,5,figsize=(15,8))
for i ,(ind,ax)in enumerate(zip(inds,axes.ravel())):
    ax.imshow(X_train[ind].reshape(image_shape))
compn=7
inds=np.argsort(X_train_nmf[:,compn])[::-1]
fig,axes=plt.subplots(2,5,figsize=(15,8))
for i,(ind,ax)in enumerate(zip(inds,axes.ravel())):
    ax.imshow(X_train[ind].reshape(image_shape))

png

png

S=mglearn.datasets.make_signals()
print(S.shape)
plt.figure(figsize=(6,1))
plt.plot(S,'-')
plt.xlabel("True")
plt.ylabel("Siginal")
(2000, 3)

Text(0, 0.5, 'Signal')

png

A=np.random.RandomState(0).uniform(size=(100,3))
X=np.dot(S,A.T)
print('Shape of measurements:{}'.format(X.shape))
Shape of measurements:(2000, 100)
nmf=NMF(n_components=3,random_state=42)
S_=nmf.fit_transform(X)
print('Recovered signal shape:{}'.format(S_.shape))
Recovered signal shape:(2000, 3)
pca=PCA(n_components=3)
H=pca.fit_transform(X)
models=[X,S,S_,H]
names=['Observations (first three measurements)','True sources','NMF recovered signals','PCA recovered signals']
fig,axes=plt.subplots(4,figsize=(8,4),gridspec_kw={'hspace':0.5},subplot_kw={'xticks':(),'yticks':()})
for model ,name,ax in zip(models,names,axes):
    ax.set_title(name)
    ax.plot(model[:,:3],'-')

png

from sklearn.datasets import load_digits
digits=load_digits()
fig,axes=plt.subplots(2,5,figsize=(10,5))
for ax,img in zip(axes.ravel(),digits.images):
    ax.imshow(img)

png

pca=PCA(n_components=2)
pca.fit(digits.data)
digits_pca=pca.transform(digits.data)
colors=['#476A2A','#7851B8','#BD3430','#4A2D4E','#875525','#A83683','#4E655E','#853541','#3A3120','#535D8E']
plt.figure(figsize=(10,10))
plt.xlim(digits_pca[:,0].min(),digits_pca[:,0].max())
plt.ylim(digits_pca[:,1].min(),digits_pca[:,1].max())
for i in range(len(digits.data)):
    plt.text(digits_pca[i,0],digits_pca[i,1],str(digits.target[i]),color=colors[digits.target[i]],fontdict={'weight':'bold','size':9})
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
Text(0, 0.5, 'Second principal component')

png

from sklearn.manifold import TSNE
tsne=TSNE(random_state=42)
digits_tsne=tsne.fit_transform(digits.data)
plt.figure(figsize=(10,10))
plt.xlim(digits_tsne[:,0].min()+1,digits_tsne[:,0].max()+1)
plt.ylim(digits_tsne[:,1].min(),digits_tsne[:,1].max()+1)
for i in range(len(digits.data)):
    plt.text(digits_tsne[i,0],digits_tsne[i,1],str(digits.target[i]),fontdict={'weight':'bold','size':9},color=colors[digits.target[i]])
plt.xlabel('t-SNE feature 0')
plt.ylabel('t-SNE feature 1')
Text(0, 0.5, 't-SNE feature 1')

png

mglearn.plots.plot_kmeans_algorithm()

png

mglearn.plots.plot_kmeans_boundaries()

png

from sklearn.datasets import make_blobs 
from sklearn.cluster import KMeans
X,y=make_blobs(random_state=1)
kmeans=KMeans(n_clusters=3)
kmeans.fit(X)
KMeans(n_clusters=3)
print('Cluster membership:\n{}'.format(kmeans.labels_))
Cluster membership:
[0 2 2 2 1 1 1 2 0 0 2 2 1 0 1 1 1 0 2 2 1 2 1 0 2 1 1 0 0 1 0 0 1 0 2 1 2
 2 2 1 1 2 0 2 2 1 0 0 0 0 2 1 1 1 0 1 2 2 0 0 2 1 1 2 2 1 0 1 0 2 2 2 1 0
 0 2 1 1 0 2 0 2 2 1 0 0 0 0 2 0 1 0 0 2 2 1 1 0 1 0]
print(kmeans.predict(X))
[0 2 2 2 1 1 1 2 0 0 2 2 1 0 1 1 1 0 2 2 1 2 1 0 2 1 1 0 0 1 0 0 1 0 2 1 2
 2 2 1 1 2 0 2 2 1 0 0 0 0 2 1 1 1 0 1 2 2 0 0 2 1 1 2 2 1 0 1 0 2 2 2 1 0
 0 2 1 1 0 2 0 2 2 1 0 0 0 0 2 0 1 0 0 2 2 1 1 0 1 0]
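Besides the labels, the fitted KMeans object exposes the learned centers and the total within-cluster squared distance (inertia); both are standard attributes:

print('Cluster centers:\n{}'.format(kmeans.cluster_centers_))   # shape (3, 2): one center per cluster
print('Inertia:{:.2f}'.format(kmeans.inertia_))                 # sum of squared distances to the closest center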
mglearn.discrete_scatter(X[:,0],X[:,1],kmeans.labels_,markers='o')
mglearn.discrete_scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],[0,1,2],markers='^',markeredgewidth=3)
[<matplotlib.lines.Line2D at 0x2ea84358e50>,
 <matplotlib.lines.Line2D at 0x2ea843681c0>,
 <matplotlib.lines.Line2D at 0x2ea843684f0>]

png

fig,axes=plt.subplots(1,2,figsize=(10,5))
kmeans=KMeans(n_clusters=2)
kmeans.fit(X)
assignments=kmeans.labels_
mglearn.discrete_scatter(X[:,0],X[:,1],assignments,ax=axes[0])
kmeans=KMeans(n_clusters=5)
kmeans.fit(X)
assignments=kmeans.labels_
mglearn.discrete_scatter(X[:,0],X[:,1],assignments,ax=axes[1])
[<matplotlib.lines.Line2D at 0x2ea867484f0>,
 <matplotlib.lines.Line2D at 0x2ea86748970>,
 <matplotlib.lines.Line2D at 0x2ea8675f1f0>,
 <matplotlib.lines.Line2D at 0x2ea8675f520>,
 <matplotlib.lines.Line2D at 0x2ea8675f850>]

png

X_varied,y_varied=make_blobs(n_samples=200,cluster_std=[1.0,2.5,0.5],random_state=170)
y_pred=KMeans(n_clusters=3,random_state=0).fit_predict(X_varied)
mglearn.discrete_scatter(X_varied[:,0],X_varied[:,1],y_pred)
plt.legend(['cluster 0','cluster 1','cluster 2'],loc='best')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

png

X,y=make_blobs(random_state=170,n_samples=600)
rng=np.random.RandomState(74)
transformation=rng.normal(size=(2,2))
X=np.dot(X,transformation)
kmeans=KMeans(n_clusters=3)
kmeans.fit(X)
y_pred=kmeans.predict(X)
plt.scatter(X[:,0],X[:,1],c=y_pred,cmap=mglearn.cm3)
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],marker='^',c=[0,1,2],s=100,linewidth=2,cmap=mglearn.cm3)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

png

from sklearn.datasets import make_moons
X,y=make_moons(n_samples=200,noise=0.05,random_state=0)
kmeans=KMeans(n_clusters=2)
kmeans.fit(X)
y_pred=kmeans.predict(X)
plt.scatter(X[:,0],X[:,1],c=y_pred,cmap=mglearn.cm2,s=60)
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],marker='^',c=[mglearn.cm2(0),mglearn.cm2(1)],s=100,linewidth=2)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

png

X_train,X_test,y_train,y_test=train_test_split(X_people,y_people,stratify=y_people,random_state=0)
nmf=NMF(n_components=100,random_state=0)
nmf.fit(X_train)
pca=PCA(n_components=100,random_state=0)
pca.fit(X_train)
kmeans=KMeans(n_clusters=100,random_state=0)
kmeans.fit(X_train)
X_reconstructed_pca=pca.inverse_transform(pca.transform(X_test))
X_reconstructed_kmeans=kmeans.cluster_centers_[kmeans.predict(X_test)]
X_reconstructed_nmf=np.dot(nmf.transform(X_test),nmf.components_)
fig,axes=plt.subplots(3,5,figsize=(8,8),subplot_kw={'xticks':(),'yticks':()})
fig.suptitle('Extracted Components')
for ax,comp_kmeans,comp_pca,comp_nmf in zip(axes.T,kmeans.cluster_centers_,pca.components_,nmf.components_):
    ax[0].imshow(comp_kmeans.reshape(image_shape))
    ax[1].imshow(comp_pca.reshape(image_shape))
    ax[2].imshow(comp_nmf.reshape(image_shape),cmap='viridis')
axes[0,0].set_ylabel('kmeans')
axes[1,0].set_ylabel('pca')
axes[2,0].set_ylabel('nmf')
fig,axes=plt.subplots(4,5,figsize=(8,8),subplot_kw={'xticks':(),'yticks':()})
fig.suptitle('Reconstructions')
for ax,orig,rec_means,rec_pca,rec_nmf in zip(axes.T,X_test,X_reconstructed_kmeans,X_reconstructed_pca,X_reconstructed_nmf):
    ax[0].imshow(orig.reshape(image_shape))
    ax[1].imshow(rec_means.reshape(image_shape))
    ax[2].imshow(rec_pca.reshape(image_shape))
    ax[3].imshow(rec_nmf.reshape(image_shape))
axes[0,0].set_ylabel('original')
axes[1,0].set_ylabel('kmeans')
axes[2,0].set_ylabel('pca')
axes[3,0].set_ylabel('nmf')
Text(0, 0.5, 'nmf')

png

png

X,y=make_moons(n_samples=200,noise=0.05,random_state=0)
kmeans=KMeans(n_clusters=10,random_state=0)
kmeans.fit(X)
y_pred=kmeans.predict(X)
plt.scatter(X[:,0],X[:,1],c=y_pred,s=60,cmap='Paired')
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],marker='^',c=range(kmeans.n_clusters),linewidth=2,cmap='Paired')
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
print('Cluster memberships:\n{}'.format(y_pred))
Cluster memberships:
[9 2 5 4 2 7 9 6 9 6 1 0 2 6 1 9 3 0 3 1 7 6 8 6 8 5 2 7 5 8 9 8 6 5 3 7 0
 9 4 5 0 1 3 5 2 8 9 1 5 6 1 0 7 4 6 3 3 6 3 8 0 4 2 9 6 4 8 2 8 4 0 4 0 5
 6 4 5 9 3 0 7 8 0 7 5 8 9 8 0 7 3 9 7 1 7 2 2 0 4 5 6 7 8 9 4 5 4 1 2 3 1
 8 8 4 9 2 3 7 0 9 9 1 5 8 5 1 9 5 6 7 9 1 4 0 6 2 6 4 7 9 5 5 3 8 1 9 5 6
 3 5 0 2 9 3 0 8 6 0 3 3 5 6 3 2 0 2 3 0 2 6 3 4 4 1 5 6 7 1 1 3 2 4 7 2 7
 3 8 6 4 1 4 3 9 9 5 1 7 5 8 2]

png

distance_features=kmeans.transform(X)
print('Distance feature shape:{}'.format(distance_features.shape))
print('Distance feature:\n{}'.format(distance_features))
Distance feature shape:(200, 10)
Distance feature:
[[0.9220768  1.46553151 1.13956805 ... 1.16559918 1.03852189 0.23340263]
 [1.14159679 2.51721597 0.1199124  ... 0.70700803 2.20414144 0.98271691]
 [0.78786246 0.77354687 1.74914157 ... 1.97061341 0.71561277 0.94399739]
 ...
 [0.44639122 1.10631579 1.48991975 ... 1.79125448 1.03195812 0.81205971]
 [1.38951924 0.79790385 1.98056306 ... 1.97788956 0.23892095 1.05774337]
 [1.14920754 2.4536383  0.04506731 ... 0.57163262 2.11331394 0.88166689]]
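kmeans.transform returns, for every sample, its Euclidean distance to each of the 10 centers, which is why it can serve as a 10-dimensional feature representation. A minimal cross-check against scipy:

from scipy.spatial.distance import cdist
manual=cdist(X,kmeans.cluster_centers_)         # pairwise Euclidean distances, shape (200, 10)
print(np.allclose(manual,distance_features))    # expected: True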
mglearn.plots.plot_agglomerative_algorithm()

png

from sklearn.cluster import AgglomerativeClustering
X,y=make_blobs(random_state=1)
agg=AgglomerativeClustering(n_clusters=3)
assignment=agg.fit_predict(X)
mglearn.discrete_scatter(X[:,0],X[:,1],assignment)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

png

mglearn.plots.plot_agglomerative()

png

from scipy.cluster.hierarchy import dendrogram,ward
X,y=make_blobs(random_state=0,n_samples=12)
linkage_array=ward(X)
dendrogram(linkage_array)
ax=plt.gca()
bounds=ax.get_xbound()
ax.plot(bounds,[7.25,7.25],'--',c='k')
ax.plot(bounds,[4,4],'--',c='k')
ax.text(bounds[1],7.25,'two clusters',va='center',fontdict={'size':15})
ax.text(bounds[1],4,'three clusters',va='center',fontdict={'size':15})
plt.xlabel('Sample index')
plt.ylabel('Cluster distance')
Text(0, 0.5, 'Cluster distance')

png

from sklearn.cluster import DBSCAN
X,y=make_blobs(n_samples=12,random_state=0)
dbscan=DBSCAN()
clusters=dbscan.fit_predict(X)
print('Cluster memberships:\n{}'.format(clusters))
Cluster memberships:
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
mglearn.plots.plot_dbscan()
min_samples: 2 eps: 1.000000  cluster: [-1  0  0 -1  0 -1  1  1  0  1 -1 -1]
min_samples: 2 eps: 1.500000  cluster: [0 1 1 1 1 0 2 2 1 2 2 0]
min_samples: 2 eps: 2.000000  cluster: [0 1 1 1 1 0 0 0 1 0 0 0]
min_samples: 2 eps: 3.000000  cluster: [0 0 0 0 0 0 0 0 0 0 0 0]
min_samples: 3 eps: 1.000000  cluster: [-1  0  0 -1  0 -1  1  1  0  1 -1 -1]
min_samples: 3 eps: 1.500000  cluster: [0 1 1 1 1 0 2 2 1 2 2 0]
min_samples: 3 eps: 2.000000  cluster: [0 1 1 1 1 0 0 0 1 0 0 0]
min_samples: 3 eps: 3.000000  cluster: [0 0 0 0 0 0 0 0 0 0 0 0]
min_samples: 5 eps: 1.000000  cluster: [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
min_samples: 5 eps: 1.500000  cluster: [-1  0  0  0  0 -1 -1 -1  0 -1 -1 -1]
min_samples: 5 eps: 2.000000  cluster: [-1  0  0  0  0 -1 -1 -1  0 -1 -1 -1]
min_samples: 5 eps: 3.000000  cluster: [0 0 0 0 0 0 0 0 0 0 0 0]

png

X,y=make_moons(n_samples=200,noise=0.05,random_state=0)
scaler=StandardScaler()
scaler.fit(X)
X_scaled=scaler.transform(X)
dbscan=DBSCAN()
clusters=dbscan.fit_predict(X_scaled)
plt.scatter(X_scaled[:,0],X_scaled[:,1],c=clusters,cmap=mglearn.cm2,s=60)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
Text(0, 0.5, 'Feature 1')

png

from sklearn.metrics import adjusted_rand_score
X,y=make_moons(n_samples=200,noise=0.05,random_state=0)
scaler=StandardScaler()
scaler.fit(X)
X_scaled=scaler.transform(X)
fig,axes=plt.subplots(1,4,figsize=(15,3),subplot_kw={'xticks':(),'yticks':()})
algorithms=[KMeans(n_clusters=2),AgglomerativeClustering(n_clusters=2),DBSCAN()]
random_state=np.random.RandomState(seed=0)
random_clusters=random_state.randint(low=0,high=2,size=len(X))
axes[0].scatter(X_scaled[:,0],X_scaled[:,1],c=random_clusters,cmap=mglearn.cm3,s=60)
axes[0].set_title('Random assignment - ARI:{:.2f}'.format(adjusted_rand_score(y,random_clusters)))
for ax,algorithm in zip(axes[1:],algorithms):
    clusters=algorithm.fit_predict(X_scaled)
    ax.scatter(X_scaled[:,0],X_scaled[:,1],c=clusters,cmap=mglearn.cm3,s=60)
    ax.set_title('{} - ARI:{:.2f}'.format(algorithm.__class__.__name__,adjusted_rand_score(y,clusters)))

png

from sklearn.decomposition import PCA
pca=PCA(n_components=100,whiten=True,random_state=0)
pca.fit_transform(X_people)
X_pca=pca.transform(X_people)
dbscan=DBSCAN()
labels=dbscan.fit_predict(X_pca)
print('Unique labels:{}'.format(np.unique(labels)))
Unique labels:[-1]
dbscan=DBSCAN(min_samples=3)
labels=dbscan.fit_predict(X_pca)
print('Unique labels:{}'.format(np.unique(labels)))
Unique labels:[-1]
dbscan=DBSCAN(min_samples=3,eps=15)
labels=dbscan.fit_predict(X_pca)
print('Unique labels:{}'.format(np.unique(labels)))
Unique labels:[-1  0]
print('Number of points per cluster:{}'.format(np.bincount(labels+1)))
Number of points per cluster:[  13 2050]
noise=X_people[labels==-1]
fig,axes=plt.subplots(3,9,subplot_kw={'xticks':(),'yticks':()},figsize=(12,4))
for image,ax in zip(noise,axes.ravel()):
    ax.imshow(image.reshape(image_shape),vmin=0,vmax=1)  # the book finds 27 noise points; newer library versions yield only 13

png

for eps in [1,3,5,7,9,11,13]:
    print('\neps={}'.format(eps))
    dbscan=DBSCAN(eps=eps,min_samples=3)
    labels=dbscan.fit_predict(X_pca)
    print('Clusters present :{}'.format(np.unique(labels)))
    print('Clusters size:{}'.format(np.bincount(labels+1)))
eps=1
Clusters present :[-1]
Clusters size:[2063]

eps=3
Clusters present :[-1]
Clusters size:[2063]

eps=5
Clusters present :[-1]
Clusters size:[2063]

eps=7
Clusters present :[-1  0  1  2  3]
Clusters size:[2042   10    4    3    4]

eps=9
Clusters present :[-1  0]
Clusters size:[1401  662]

eps=11
Clusters present :[-1  0]
Clusters size:[ 459 1604]

eps=13
Clusters present :[-1  0]
Clusters size:[  83 1980]
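Without ground-truth labels, one rough way to compare these eps settings is the silhouette score on the non-noise points (a simplification, since it ignores how much data is labeled noise); a sketch:

from sklearn.metrics import silhouette_score
for eps in [7,9,11]:
    labels=DBSCAN(min_samples=3,eps=eps).fit_predict(X_pca)
    core=labels!=-1                              # score only the clustered points
    if len(np.unique(labels[core]))>1:           # silhouette needs at least two clusters
        print('eps={}: silhouette={:.2f}'.format(eps,silhouette_score(X_pca[core],labels[core])))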
dbscan=DBSCAN(min_samples=3,eps=7)
labels=dbscan.fit_predict(X_pca)
for cluster in range(max(labels)+1):
    mask=labels==cluster
    n_images=np.sum(mask)
    fig,axes=plt.subplots(1,n_images,figsize=(n_images*1.5,4),subplot_kw={'xticks':(),'yticks':()})
    for image,label,ax in zip(X_people[mask],y_people[mask],axes):
        ax.imshow(image.reshape(image_shape))
        ax.set_title(people.target_names[label].split()[-1])

png

png

png

png

km=KMeans(n_clusters=10,random_state=0)
labels_km=km.fit_predict(X_pca)
print('Cluster size k-means:{}'.format(np.bincount(labels_km)))
Cluster size k-means:[124  77 215 177 152 344  78 367 319 210]
fig,axes=plt.subplots(2,5,figsize=(12,4),subplot_kw={'xticks':(),'yticks':()})
for center ,ax in zip(km.cluster_centers_,axes.ravel()):
    ax.imshow(pca.inverse_transform(center).reshape(image_shape),vmin=0,vmax=1)

png

mglearn.plots.plot_kmeans_faces(km,pca,X_pca,X_people,y_people,people.target_names)

png

agglomerative=AgglomerativeClustering(n_clusters=10)
labels_agg=agglomerative.fit_predict(X_pca)
print('Cluster sizes agglomerative clustering:{}'.format(np.bincount(labels_agg)))
Cluster sizes agglomerative clustering:[279 103 291 102  84 987  59  49  53  56]
print('ARI:{:.2f}'.format(adjusted_rand_score(labels_agg,labels_km)))
ARI:0.06
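ARI is not the only way to compare two clusterings; normalized mutual information is another label-permutation-invariant measure and should tell a similar story here. A quick cross-check:

from sklearn.metrics import normalized_mutual_info_score
print('NMI:{:.2f}'.format(normalized_mutual_info_score(labels_agg,labels_km)))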
linkage_array=ward(X_pca)
plt.figure(figsize=(20,5))
dendrogram(linkage_array,p=7,truncate_mode='level',no_labels=True)
plt.xlabel('Sample index')
plt.ylabel('Cluster distance')
Text(0, 0.5, 'Cluster distance')

png

n_clusters=10
for cluster in range(n_clusters):
    mask=labels_agg==cluster
    fig,axes=plt.subplots(1,10,subplot_kw={'xticks':(),'yticks':()},figsize=(15,8))
    axes[0].set_ylabel(np.sum(mask))
    for image,label,asdf,ax in zip(X_people[mask],y_people[mask],labels_agg[mask],axes):
        ax.imshow(image.reshape(image_shape),vmin=0,vmax=1)
        ax.set_title(people.target_names[label].split()[-1],fontdict={'fontsize':9})
print(labels_agg)
[2 0 4 ... 2 2 2]

png

png

png

png

png

png

png

png

png

png

agglomerative=AgglomerativeClustering(n_clusters=40)
labels_agg=agglomerative.fit_predict(X_pca)
print('Cluster size agglomerative clustering :{}'.format(np.bincount(labels_agg)))
n_clusters=40
for  cluster in [10,13,19,22,36]:
    mask=labels_agg==cluster
    fig,axes=plt.subplots(1,15,subplot_kw={'xticks':(),'yticks':()},figsize=(15,8))
    cluster_size=np.sum(mask)
    axes[0].set_ylabel('#{}:{}'.format(cluster,cluster_size))
    for image,label,asdf,ax in zip(X_people[mask],y_people[mask],labels_agg[mask],axes):
        ax.imshow(image.reshape(image_shape))
        ax.set_title(people.target_names[label].split()[-1],fontdict={'fontsize':9})
    for i in range(cluster_size,15):
        axes[i].set_visible(False)
Cluster size agglomerative clustering :[291  30  94  96  50  62 287  27  64  56  28  40  31  42  40  98  30  55
  25  37  22  14  13  19  33  22  14  23 113  50  27   7  48  22   6  53
  43  14  18  19]

png

png

png

png

png

import pandas as pd
from IPython.display import display
data=pd.read_csv('./adult.csv',header=None,index_col=False,names=['age','workclass','fnlwgt','education','education-num','marital-status'
                                                                ,'occupation','relationship','race','gender','capital-gain',
                                                                'capital-loss','hours-per-week','native-country','income'])
data=data[['age','workclass','education','gender','hours-per-week','occupation','income']]
display(data.head())
age workclass education gender hours-per-week occupation income
0 39 State-gov Bachelors Male 40 Adm-clerical <=50K
1 50 Self-emp-not-inc Bachelors Male 13 Exec-managerial <=50K
2 38 Private HS-grad Male 40 Handlers-cleaners <=50K
3 53 Private 11th Male 40 Handlers-cleaners <=50K
4 28 Private Bachelors Female 40 Prof-specialty <=50K
print(data.gender.value_counts())
 Male      21790
 Female    10771
Name: gender, dtype: int64
print('Original features:\n',list(data.columns),'\n')
data_dummies=pd.get_dummies(data)
print("Feature after get_duummies:\n",list(data_dummies.columns))
Original features:
 ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] 

Features after get_dummies:
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-serv', 'occupation_ Sales', 'occupation_ Tech-support', 'occupation_ Transport-moving', 'income_ <=50K', 'income_ >50K']
data_dummies.head()
age hours-per-week workclass_ ? workclass_ Federal-gov workclass_ Local-gov workclass_ Never-worked workclass_ Private workclass_ Self-emp-inc workclass_ Self-emp-not-inc workclass_ State-gov ... occupation_ Machine-op-inspct occupation_ Other-service occupation_ Priv-house-serv occupation_ Prof-specialty occupation_ Protective-serv occupation_ Sales occupation_ Tech-support occupation_ Transport-moving income_ <=50K income_ >50K
0 39 40 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 1 0
1 50 13 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 1 0
2 38 40 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
3 53 40 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
4 28 40 0 0 0 0 1 0 0 0 ... 0 0 0 1 0 0 0 0 1 0

5 rows × 46 columns

features=data_dummies.loc[:,'age':'occupation_ Transport-moving']
X=features.values
y=data_dummies['income_ >50K'].values
print(np.bincount(y))
print(np.bincount(data_dummies['income_ <=50K']))
print('X.shape:{},y.shape:{}'.format(X.shape,y.shape))
[24720  7841]
[ 7841 24720]
X.shape:(32561, 44),y.shape:(32561,)
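A caveat with get_dummies: calling it separately on training and test data can silently produce mismatched columns if a category is missing from one split. A defensive sketch (the split below is hypothetical, for illustration only):

df_train,df_test=data.iloc[:20000],data.iloc[20000:]     # hypothetical train/test split
dummies_train=pd.get_dummies(df_train)
dummies_test=pd.get_dummies(df_test).reindex(columns=dummies_train.columns,fill_value=0)   # align to training columns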
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
logreg=LogisticRegression()
logreg.fit(X_train,y_train)
print('Test score:{:.2f}'.format(logreg.score(X_test,y_test)))
Test score:0.81


C:\Users\reion\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
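The convergence warning above is harmless here, but it can be removed either by scaling the features or by giving the lbfgs solver more iterations; max_iter is a standard LogisticRegression parameter:

logreg=LogisticRegression(max_iter=1000)
logreg.fit(X_train,y_train)
print('Test score:{:.2f}'.format(logreg.score(X_test,y_test)))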
demo_df=pd.DataFrame({'Integer Feature':[0,1,2,1],'Categorical feature':['socks','fox','socks','box']})
display(demo_df)
Integer Feature Categorical feature
0 0 socks
1 1 fox
2 2 socks
3 1 box
pd.get_dummies(demo_df)
Integer Feature Categorical feature_box Categorical feature_fox Categorical feature_socks
0 0 0 0 1
1 1 0 1 0
2 2 0 0 1
3 1 1 0 0
demo_df['Inter Feature']=demo_df['Integer Feature'].astype(str)
pd.get_dummies(demo_df,columns=['Integer Feature','Categorical feature'])
Inter Feature Integer Feature_0 Integer Feature_1 Integer Feature_2 Categorical feature_box Categorical feature_fox Categorical feature_socks
0 0 1 0 0 0 0 1
1 1 0 1 0 0 1 0
2 2 0 0 1 0 0 1
3 1 0 1 0 1 0 0
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
X,y=mglearn.datasets.make_wave(n_samples=100)
line=np.linspace(-3,3,1000,endpoint=False).reshape(-1,1)
reg=DecisionTreeRegressor(min_samples_split=3).fit(X,y)
plt.plot(line,reg.predict(line),label='decision tree')
reg=LinearRegression().fit(X,y)
plt.plot(line,reg.predict(line),label='linear regression')
plt.plot(X[:,0],y,'o',c='k')
plt.ylabel('Regression output')
plt.xlabel('Input feature')
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x2ea8b91d6a0>

png

bins=np.linspace(-3,3,11)
print('bins:{}'.format(bins))
bins:[-3.  -2.4 -1.8 -1.2 -0.6  0.   0.6  1.2  1.8  2.4  3. ]
which_bins=np.digitize(X,bins=bins)
print('\nData points:\n{}'.format(X[:5]))
print('\nBin membership for data points:\n',which_bins[:5])
Data points:
[[-0.75275929]
 [ 2.70428584]
 [ 1.39196365]
 [ 0.59195091]
 [-2.06388816]]

Bin membership for data points:
 [[ 4]
 [10]
 [ 8]
 [ 6]
 [ 2]]
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(sparse=False)
encoder.fit(which_bins)
X_bined=encoder.transform(which_bins)
print(X_bined[:5])
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
print('X_bined.shape:{}'.format(X_bined.shape))
X_bined.shape:(100, 10)
line_bined=encoder.transform(np.digitize(line,bins=bins))
reg=LinearRegression().fit(X_bined,y)
plt.plot(line,reg.predict(line_bined),label='linear regression bined')
reg=DecisionTreeRegressor(min_samples_split=3).fit(X_bined,y)
plt.plot(line,reg.predict(line_bined),label='Decision tree bined')
plt.plot(X[:,0],y,'o',c='k')
plt.vlines(bins,-3,3,linewidth=1,alpha=.2)
plt.legend(loc='best')
plt.ylabel('Regression output')
plt.xlabel('Input feature')
Text(0.5, 0, 'Input feature')

png

X_combined=np.hstack([X,X_bined])
print(X_combined.shape)
(100, 11)
reg=LinearRegression().fit(X_combined,y)
line_combined=np.hstack([line,line_bined])
plt.plot(line,reg.predict(line_combined),label='linear regression combined')
for bin in bins:
    plt.plot([bin,bin],[-3,3],':',c='k')
print(bins)
plt.legend(loc='best')
plt.ylabel('Regression output')
plt.xlabel('Input Feature')
plt.plot(X[:,0],y,'o',c='k')
[-3.  -2.4 -1.8 -1.2 -0.6  0.   0.6  1.2  1.8  2.4  3. ]

[<matplotlib.lines.Line2D at 0x2ea8d6cc3a0>]

png

X_product=np.hstack([X_bined,X * X_bined])
print(X_product.shape)
(100, 20)
reg=LinearRegression().fit(X_product,y)
line_product=np.hstack([line_bined,line*line_bined])
plt.plot(line,reg.predict(line_product),label='linear regression product')
for bin in bins:
    plt.plot([bin,bin],[-3,3],':',c='k')
plt.plot(X[:,0],y,'o',c='k')
plt.ylabel('Regression output')
plt.xlabel('Input Feature')
plt.legend(loc='best')
print(X.shape)
print(y.shape)
(100, 1)
(100,)

png

from sklearn.preprocessing import PolynomialFeatures
poly=PolynomialFeatures(degree=10,include_bias=False)
poly.fit(X)
X_poly=poly.transform(X)
print('X_poly.shape:{}'.format(X_poly.shape))
X_poly.shape:(100, 10)
print('Entries of X:\n{}'.format(X[:5]))
print('Entries of X_poly:\n{}'.format(X_poly[:5]))
Entries of X:
[[-0.75275929]
 [ 2.70428584]
 [ 1.39196365]
 [ 0.59195091]
 [-2.06388816]]
Entries of X_poly:
[[-7.52759287e-01  5.66646544e-01 -4.26548448e-01  3.21088306e-01
  -2.41702204e-01  1.81943579e-01 -1.36959719e-01  1.03097700e-01
  -7.76077513e-02  5.84199555e-02]
 [ 2.70428584e+00  7.31316190e+00  1.97768801e+01  5.34823369e+01
   1.44631526e+02  3.91124988e+02  1.05771377e+03  2.86036036e+03
   7.73523202e+03  2.09182784e+04]
 [ 1.39196365e+00  1.93756281e+00  2.69701700e+00  3.75414962e+00
   5.22563982e+00  7.27390068e+00  1.01250053e+01  1.40936394e+01
   1.96178338e+01  2.73073115e+01]
 [ 5.91950905e-01  3.50405874e-01  2.07423074e-01  1.22784277e-01
   7.26822637e-02  4.30243318e-02  2.54682921e-02  1.50759786e-02
   8.92423917e-03  5.28271146e-03]
 [-2.06388816e+00  4.25963433e+00 -8.79140884e+00  1.81444846e+01
  -3.74481869e+01  7.72888694e+01 -1.59515582e+02  3.29222321e+02
  -6.79478050e+02  1.40236670e+03]]
print('Polynomial feature names:\n{}'.format(poly.get_feature_names()))
Polynomial feature names:
['x0', 'x0^2', 'x0^3', 'x0^4', 'x0^5', 'x0^6', 'x0^7', 'x0^8', 'x0^9', 'x0^10']
reg=LinearRegression().fit(X_poly,y)
line_poly=poly.transform(line)
plt.plot(line,reg.predict(line_poly),label='polynomial linear regression')
plt.plot(X[:,0],y,'o',c='k')
plt.ylabel('Regression output')
plt.xlabel('Input feature')
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x2ea9173edc0>

png

from sklearn.svm import SVR
for gamma in [1,10]:
    svr=SVR(gamma=gamma).fit(X,y)
    plt.plot(line,svr.predict(line),label='SVR gamma={}'.format(gamma))
plt.plot(X[:,0],y,'o',c='k')
plt.ylabel('Regression output')
plt.xlabel('Input feature')
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x2ea92d97a60>

png

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
boston=load_boston()
X_train,X_test,y_train,y_test=train_test_split(boston.data,boston.target,random_state=0)
scaler=MinMaxScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)
poly=PolynomialFeatures(degree=2).fit(X_train_scaled)
X_train_poly=poly.transform(X_train_scaled)
X_test_poly=poly.transform(X_test_scaled)
print('X_train.shape:{}'.format(X_train.shape))
print('X_train_poly.shape:{}'.format(X_train_poly.shape))
X_train.shape:(379, 13)
X_train_poly.shape:(379, 105)
print('Polynomial feature names:\n{}'.format(poly.get_feature_names()))
Polynomial feature names:
['1', 'x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x0^2', 'x0 x1', 'x0 x2', 'x0 x3', 'x0 x4', 'x0 x5', 'x0 x6', 'x0 x7', 'x0 x8', 'x0 x9', 'x0 x10', 'x0 x11', 'x0 x12', 'x1^2', 'x1 x2', 'x1 x3', 'x1 x4', 'x1 x5', 'x1 x6', 'x1 x7', 'x1 x8', 'x1 x9', 'x1 x10', 'x1 x11', 'x1 x12', 'x2^2', 'x2 x3', 'x2 x4', 'x2 x5', 'x2 x6', 'x2 x7', 'x2 x8', 'x2 x9', 'x2 x10', 'x2 x11', 'x2 x12', 'x3^2', 'x3 x4', 'x3 x5', 'x3 x6', 'x3 x7', 'x3 x8', 'x3 x9', 'x3 x10', 'x3 x11', 'x3 x12', 'x4^2', 'x4 x5', 'x4 x6', 'x4 x7', 'x4 x8', 'x4 x9', 'x4 x10', 'x4 x11', 'x4 x12', 'x5^2', 'x5 x6', 'x5 x7', 'x5 x8', 'x5 x9', 'x5 x10', 'x5 x11', 'x5 x12', 'x6^2', 'x6 x7', 'x6 x8', 'x6 x9', 'x6 x10', 'x6 x11', 'x6 x12', 'x7^2', 'x7 x8', 'x7 x9', 'x7 x10', 'x7 x11', 'x7 x12', 'x8^2', 'x8 x9', 'x8 x10', 'x8 x11', 'x8 x12', 'x9^2', 'x9 x10', 'x9 x11', 'x9 x12', 'x10^2', 'x10 x11', 'x10 x12', 'x11^2', 'x11 x12', 'x12^2']
from sklearn.linear_model import Ridge
ridge=Ridge().fit(X_train_scaled,y_train)
print('Score without interactions:{:.3f}'.format(ridge.score(X_test_scaled,y_test)))
ridge=Ridge().fit(X_train_poly,y_train)
print('Score with interactions:{:.3f}'.format(ridge.score(X_test_poly,y_test)))
Score without interactions:0.621
Score with interactions:0.753
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor(n_estimators=100).fit(X_train_scaled,y_train)
print('Score without interactions:{:.3f}'.format(rf.score(X_test_scaled,y_test)))
rf=RandomForestRegressor(n_estimators=100).fit(X_train_poly,y_train)
print('Score with interactions:{:.3f}'.format(rf.score(X_test_poly,y_test)))
Score without interactions:0.804
Score with interactions:0.777
rnd=np.random.RandomState(0)
X_org=rnd.normal(size=(1000,3))
w=rnd.normal(size=3)
X=rnd.poisson(10*np.exp(X_org))
y=np.dot(X_org,w)
print(w.shape)
print(X.shape)
print(y.shape)
(3,)
(1000, 3)
(1000,)
print('Number of feature appearances:\n{}'.format(np.bincount(X[:,0])))
Number of feature appearances:
[28 38 68 48 61 59 45 56 37 40 35 34 36 26 23 26 27 21 23 23 18 21 10  9
 17  9  7 14 12  7  3  8  4  5  5  3  4  2  4  1  1  3  2  5  3  8  2  5
  2  1  2  3  3  2  2  3  3  0  1  2  1  0  0  3  1  0  0  0  1  3  0  1
  0  2  0  1  1  0  0  0  0  1  0  0  2  2  0  1  1  0  0  0  0  1  1  0
  0  0  0  0  0  0  1  0  0  0  0  0  1  1  0  0  1  0  0  0  0  0  0  0
  1  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1]
bins=np.bincount(X[:,0])
plt.bar(range(len(bins)),bins,color='r',alpha=.5)
plt.ylabel('Number of appearances')
plt.xlabel('Value')
Text(0.5, 0, 'Value')

png

from sklearn.linear_model import Ridge
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
score=Ridge().fit(X_train,y_train).score(X_test,y_test)
print('Test score:{:.3f}'.format(score))
Test score:0.622
X_train_log=np.log(X_train+1)
X_test_log=np.log(X_test+1)
plt.hist(X_train_log[:,0],bins=25,color='gray')
plt.ylabel('Number of appearances')
plt.xlabel('value')
Text(0.5, 0, 'value')

png

score=Ridge().fit(X_train_log,y_train).score(X_test_log,y_test)
print('Test score:{:.3f}'.format(score))
Test score:0.875
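np.log(X+1) is common enough that NumPy provides np.log1p, which computes the same thing with better accuracy near zero; an equivalent sketch:

X_train_log=np.log1p(X_train)   # identical to np.log(X_train+1)
X_test_log=np.log1p(X_test)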
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
cancer=load_breast_cancer()
rng=np.random.RandomState(0)
noise=rng.normal(size=(len(cancer.data),50))
X_w_noise=np.hstack([cancer.data,noise])
X_train,X_test,y_train,y_test=train_test_split(X_w_noise,cancer.target,random_state=0,test_size=.5)
select=SelectPercentile(percentile=50)
select.fit(X_train,y_train)
X_train_selected=select.transform(X_train)
print('X_train.shape:{}'.format(X_train.shape))
print('X_train_selected.shape:{}'.format(X_train_selected.shape))
X_train.shape:(284, 80)
X_train_selected.shape:(284, 40)
mask=select.get_support()
print(mask)
plt.matshow(mask.reshape(1,-1),cmap='gray_r')
plt.xlabel('Sample index')
[ True  True  True  True  True  True  True  True  True False  True False
  True  True  True  True  True  True False False  True  True  True  True
  True  True  True  True  True  True False  True False  True  True  True
 False False False False False  True False False False False False False
 False False  True False  True  True False False  True False  True  True
 False False False False  True False  True False False False  True False
 False False False False False False False False]

Text(0.5, 0, 'Sample index')

png

from sklearn.linear_model import LogisticRegression
X_test_selected=select.transform(X_test)
lr=LogisticRegression()
lr.fit(X_train,y_train)
print('Score with all features:{:.3f}'.format(lr.score(X_test,y_test)))
lr.fit(X_train_selected,y_train)
print('Score with only selected feature:{:.3f}'.format(lr.score(X_test_selected,y_test)))
Score with all features:0.937
Score with only selected feature:0.933
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
select=SelectFromModel(RandomForestClassifier(n_estimators=100,random_state=42),threshold='median')
select.fit(X_train,y_train)
X_train_l1=select.transform(X_train)
print('X_train.shape:{}'.format(X_train.shape))
print('X_train_l1.shape:{}'.format(X_train_l1.shape))
X_train.shape:(284, 80)
X_train_l1.shape:(284, 40)
mask=select.get_support()
plt.matshow(mask.reshape(1,-1),cmap='gray_r')
plt.xlabel('Sample index')
Text(0.5, 0, 'Sample index')

png

from sklearn.feature_selection import RFE
select=RFE(RandomForestClassifier(n_estimators=100,random_state=42),n_features_to_select=40)
select.fit(X_train,y_train)
mask=select.get_support()
plt.matshow(mask.reshape(1,-1),cmap='gray_r')
plt.xlabel('Sample index')
Text(0.5, 0, 'Sample index')

png

X_train_rfe=select.transform(X_train)
X_test_rfe=select.transform(X_test)
score=LogisticRegression().fit(X_train_rfe,y_train).score(X_test_rfe,y_test)
print('Test score:{:.3f}'.format(score))
Test score:0.926
print('Test score :{:.3f}'.format(select.score(X_test,y_test)))
Test score :0.954
citibike=mglearn.datasets.load_citibike()
print('Citi Bike data:\n{}'.format(citibike.head()))
Citi Bike data:
starttime
2015-08-01 00:00:00     3
2015-08-01 03:00:00     0
2015-08-01 06:00:00     9
2015-08-01 09:00:00    41
2015-08-01 12:00:00    39
Freq: 3H, Name: one, dtype: int64
plt.figure(figsize=(10,3))
xticks=pd.date_range(start=citibike.index.min(),end=citibike.index.max())
plt.xticks(xticks,xticks.strftime('%a %m-%d'),rotation=90,ha='left')
plt.plot(citibike,linewidth=1)
plt.xlabel('Date')
plt.ylabel('Rentals')
print(citibike.index)
DatetimeIndex(['2015-08-01 00:00:00', '2015-08-01 03:00:00',
               '2015-08-01 06:00:00', '2015-08-01 09:00:00',
               '2015-08-01 12:00:00', '2015-08-01 15:00:00',
               '2015-08-01 18:00:00', '2015-08-01 21:00:00',
               '2015-08-02 00:00:00', '2015-08-02 03:00:00',
               ...
               '2015-08-30 18:00:00', '2015-08-30 21:00:00',
               '2015-08-31 00:00:00', '2015-08-31 03:00:00',
               '2015-08-31 06:00:00', '2015-08-31 09:00:00',
               '2015-08-31 12:00:00', '2015-08-31 15:00:00',
               '2015-08-31 18:00:00', '2015-08-31 21:00:00'],
              dtype='datetime64[ns]', name='starttime', length=248, freq='3H')

png

y=citibike.values
X = citibike.index.astype("int64").values.reshape(-1, 1)
X.shape
(248, 1)
n_train=184
def eval_on_features(features,target,regressor):
    X_train,X_test=features[:n_train],features[n_train:]
    y_train,y_test=target[:n_train],target[n_train:]
    regressor.fit(X_train,y_train)
    print('Test-set R^2:{:.2f}'.format(regressor.score(X_test,y_test)))
    y_pred=regressor.predict(X_test)
    y_pred_train=regressor.predict(X_train)
    plt.figure(figsize=(10,3))
    plt.xticks(range(0,len(X),8),xticks.strftime('%a %m-%d'),rotation=90,ha='left')
    plt.plot(range(n_train),y_train,label='train')
    plt.plot(range(n_train,len(y_test)+n_train),y_test,'-',label='test')
    plt.plot(range(n_train),y_pred_train,'--',label='Prediction train')
    plt.plot(range(n_train,len(y_test)+n_train),y_pred,'--',label='prediction test')
    plt.legend(loc=(1.01,0))
    plt.xlabel('Date')
    plt.ylabel('Rentals')
from sklearn.ensemble import RandomForestRegressor
regressor=RandomForestRegressor(n_estimators=100,random_state=0)
plt.figure()
eval_on_features(X,y,regressor)
Test-set R^2:-0.04

<Figure size 432x288 with 0 Axes>

png
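The R^2 of -0.04 is expected: the POSIX-time feature keeps growing, so every test timestamp lies outside the range seen during training, and tree-based models cannot extrapolate beyond their training data. A minimal check with the X and n_train defined above:

print('last training timestamp:{}'.format(X[:n_train].max()))
print('first test timestamp   :{}'.format(X[n_train:].min()))   # strictly later than anything in training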

X_hour=citibike.index.hour.values.reshape(-1,1)
eval_on_features(X_hour,y,regressor)
Test-set R^2:0.60

png

X_hour_week=np.hstack([citibike.index.dayofweek.values.reshape(-1,1),citibike.index.hour.values.reshape(-1,1)])
eval_on_features(X_hour_week,y,regressor)
Test-set R^2:0.84

png

from sklearn.preprocessing import OneHotEncoder
enc=OneHotEncoder()
X_hour_week_onehot=enc.fit_transform(X_hour_week).toarray()
X_hour_week_onehot
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 1.]])
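With 7 weekday values and 8 three-hour slots, the encoder should produce 7+8=15 indicator columns; a sanity check (the expected shape is derived from the categories, not recorded from a run):

print(X_hour_week_onehot.shape)   # expected: (248, 15)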
from sklearn.linear_model import Ridge
eval_on_features(X_hour_week_onehot,y,Ridge())
Test-set R^2:0.62

png

from sklearn.preprocessing import PolynomialFeatures
poly_transform=PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
X_hour_week_onehot_poly=poly_transform.fit_transform(X_hour_week_onehot)
lr=Ridge()
eval_on_features(X_hour_week_onehot_poly,y,lr)
Test-set R^2:0.85

png

hour=['%02d:00'% i for i in range(0,24,3)]
day=['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
features=day+hour
print(hour)
print(day)
print(features)
['00:00', '03:00', '06:00', '09:00', '12:00', '15:00', '18:00', '21:00']
['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', '00:00', '03:00', '06:00', '09:00', '12:00', '15:00', '18:00', '21:00']
features_poly=poly_transform.get_feature_names(features)
features_nonzero=np.array(features_poly)[lr.coef_!=0]
coef_nonzero=lr.coef_[lr.coef_!=0]
plt.figure(figsize=(15,2))
plt.plot(coef_nonzero,'o')
plt.xticks(np.arange(len(coef_nonzero)),features_nonzero,rotation=90)
plt.xlabel('Feature name')
plt.ylabel('Feature magnitude')
Text(0, 0.5, 'Feature magnitude')

png

from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X,y=make_blobs(random_state=0)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
logreg=LogisticRegression().fit(X_train,y_train)
print('Test set score:{:.2f}'.format(logreg.score(X_test,y_test)))
Test set score:0.88
mglearn.plots.plot_cross_validation()

png

from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
iris=load_iris()
logreg=LogisticRegression()
scores=cross_val_score(logreg,iris.data,iris.target,cv=3)
print('Cross-validation score:{}'.format(scores))
Cross-validation score:[0.98 0.96 0.98]
scores=cross_val_score(logreg,iris.data,iris.target,cv=5)
print('Cross-validation score:{}'.format(scores))
Cross-validation score:[0.96666667 1.         0.93333333 0.96666667 1.        ]
print('Average cross-validation score:{:.2f}'.format(scores.mean()))
Average cross-validation score:0.97
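cross_val_score reports a single metric; the related cross_validate helper also records fit times and, optionally, training scores. A brief sketch:

from sklearn.model_selection import cross_validate
res=cross_validate(logreg,iris.data,iris.target,cv=5,return_train_score=True)
print(res['test_score'])    # same five scores as above
print(res['train_score'])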
from sklearn.datasets import load_iris
iris=load_iris()
print('Iris labels:\n{}'.format(iris.target))
Iris labels:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
mglearn.plots.plot_stratified_cross_validation()

png

from sklearn.model_selection import KFold
kfold=KFold(n_splits=5)
print('Cross_validation scores:\n{}'.format(cross_val_score(logreg,iris.data,iris.target,cv=kfold)))
Cross_validation scores:
[1.         1.         0.86666667 0.93333333 0.83333333]
kfold=KFold(n_splits=3)
print('Cross_validation scores:\n{}'.format(cross_val_score(logreg,iris.data,iris.target,cv=kfold)))
Cross_validation scores:
[0. 0. 0.]
kfold=KFold(n_splits=3,shuffle=True,random_state=0)
print('Cross_validation scores:\n{}'.format(cross_val_score(logreg,iris.data,iris.target,cv=kfold)))
Cross_validation scores:
[0.98 0.96 0.96]
from sklearn.model_selection import LeaveOneOut
loo=LeaveOneOut()
scores=cross_val_score(logreg,iris.data,iris.target,cv=loo)
print('Number of cv iterations:',len(scores))
print('Mean accuracy:{:.2f}'.format(scores.mean()))


Number of cv iterations: 150
Mean accuracy:0.97
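Leave-one-out is k-fold with k equal to the number of samples, so the 150 cv iterations above mean 150 separate model fits; the split count can be confirmed directly (a sketch reusing loo and iris from above):

# Sketch: LeaveOneOut produces one split per sample
print('Number of splits:',loo.get_n_splits(iris.data))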


mglearn.plots.plot_shuffle_split()

png

from sklearn.model_selection import ShuffleSplit
shuffle_split=ShuffleSplit(test_size=.5,train_size=.5,n_splits=10)
scores=cross_val_score(logreg,iris.data,iris.target,cv=shuffle_split)
print('Cross-validation scores:\n{}'.format(scores))
Cross-validation scores:
[0.94666667 0.93333333 0.97333333 0.93333333 0.97333333 0.96
 0.97333333 0.94666667 0.96       0.96      ]
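For classification there is also a stratified variant of this strategy that preserves the class proportions in each split (a sketch reusing logreg and iris; the split parameters mirror the cell above, random_state=0 is an assumption):

# Sketch: stratified counterpart of ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit,cross_val_score
stratified_split=StratifiedShuffleSplit(n_splits=10,test_size=.5,train_size=.5,random_state=0)
print('Cross-validation scores:\n{}'.format(cross_val_score(logreg,iris.data,iris.target,cv=stratified_split)))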


from sklearn.model_selection import GroupKFold
from sklearn.datasets import make_blobs
X,y=make_blobs(n_samples=12,random_state=0)
groups=[0,0,0,1,1,1,1,2,2,3,3,3]
scores=cross_val_score(logreg,X,y,groups=groups,cv=GroupKFold(n_splits=3))
print('Cross-validation scores:\n{}'.format(scores))
Cross-validation scores:
[0.75       0.6        0.66666667]
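GroupKFold guarantees that all samples sharing a group label end up on the same side of each split, so no group is ever divided between training and test; this can be verified directly (a sketch reusing X, y and groups from above):

# Sketch: every group appears in exactly one test fold
import numpy as np
for train_idx,test_idx in GroupKFold(n_splits=3).split(X,y,groups=groups):
    print('test groups:',np.unique(np.array(groups)[test_idx]))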


mglearn.plots.plot_group_kfold()

png

from sklearn.svm import SVC
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,random_state=0)
print('Size of training set:{} size of test set:{}'.format(X_train.shape[0],X_test.shape[0]))
best_score=0
for gamma in [0.001,0.01,0.1,1,10,100]:
    for C in [0.001,0.01,0.1,1,10,100]:
        svm=SVC(gamma=gamma,C=C)
        svm.fit(X_train,y_train)
        score=svm.score(X_test,y_test)
        if score>best_score:
            best_score=score
            best_parameters={'C':C,'gamma':gamma}
print('Best score:{:.2f}'.format(best_score))
print('Best parameters:{}'.format(best_parameters))
Size of training set:112 size of test set:38
Best score:0.97
Best parameters:{'C': 100, 'gamma': 0.001}
mglearn.plots.plot_threefold_split()

png

from sklearn.svm import SVC
X_trainval,X_test,y_trainval,y_test=train_test_split(iris.data,iris.target,random_state=0)
X_train,X_valid,y_train,y_valid=train_test_split(X_trainval,y_trainval,random_state=1)
print('Size of training set:{} size of validation set:{} size of test set:{}\n'.format(X_train.shape[0],X_valid.shape[0],X_test.shape[0]))
best_score=0
for gamma in [0.001,0.01,0.1,1,10,100]:
    for C in [0.001,0.01,0.1,1,10,100]:
        svm=SVC(gamma=gamma,C=C)
        svm.fit(X_train,y_train)
        score=svm.score(X_valid,y_valid)
        if score>best_score:
            best_score=score
            best_parameters={'C':C,'gamma':gamma}
            
svm=SVC(**best_parameters)
svm.fit(X_trainval,y_trainval)
test_scores=svm.score(X_test,y_test)
print('Best score on validation:{:.2f}'.format(best_score))
print('Best parameters:{}'.format(best_parameters))
print('Test set score with best parameters:{:.2f}'.format(test_scores))
Size of training set:84 size of validation set:28 size of test set:38

Best score on validation:0.97
Best parameters:{'C': 100, 'gamma': 0.001}
Test set score with best parameters:0.97
best_score=0
for gamma in [0.001,0.01,0.1,1,10,100]:
    for C in [0.001,0.01,0.1,1,10,100]:
        svm=SVC(gamma=gamma,C=C)
        scores=cross_val_score(svm,X_trainval,y_trainval,cv=5)
        score=np.mean(scores)
        if score>best_score:
            best_score=score
            best_parameters={'C':C,'gamma':gamma}
svm=SVC(**best_parameters)
svm.fit(X_trainval,y_trainval)
test_scores=svm.score(X_test,y_test)
print('Best score on validation:{:.2f}'.format(best_score))
print('Best parameters:{}'.format(best_parameters))
print('Test set score with best parameters:{:.2f}'.format(test_scores))
Best score on validation:0.97
Best parameters:{'C': 100, 'gamma': 0.001}
Test set score with best parameters:0.97
# mglearn.plots.plot_cross_val_selection()
mglearn.plots.plot_grid_search_overview()

png

param_grid={'C':[0.001,0.01,0.1,1,10,100],'gamma':[0.001,0.01,0.1,1,10,100]}
print('Parameter grid:\n{}'.format(param_grid))
Parameter grid:
{'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search=GridSearchCV(SVC(),param_grid,cv=5)
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,random_state=0)
grid_search.fit(X_train,y_train)
GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]})
print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))
Test set score:0.97
print('Best parameters:{}'.format(grid_search.best_params_))
print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))
Best parameters:{'C': 10, 'gamma': 0.1}
Best cross-validation score:0.97
print('Best estimator:\n{}'.format(grid_search.best_estimator_))
Best estimator:
SVC(C=10, gamma=0.1)
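# Note (added sketch): with the default refit=True, GridSearchCV refits
# best_estimator_ on the whole training set, so grid_search itself can be
# used like a normal classifier without extracting best_estimator_ manually.
print('Prediction for the first test sample:{}'.format(grid_search.predict(X_test[:1])))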
import pandas as pd 
results=pd.DataFrame(grid_search.cv_results_)
display(results)
mean_fit_time std_fit_time mean_score_time std_score_time param_C param_gamma params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 0.000800 4.001146e-04 0.000200 3.993988e-04 0.001 0.001 {'C': 0.001, 'gamma': 0.001} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
1 0.000800 3.999236e-04 0.000200 3.996849e-04 0.001 0.01 {'C': 0.001, 'gamma': 0.01} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
2 0.000600 4.899017e-04 0.000000 0.000000e+00 0.001 0.1 {'C': 0.001, 'gamma': 0.1} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
3 0.000000 0.000000e+00 0.000800 4.000665e-04 0.001 1 {'C': 0.001, 'gamma': 1} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
4 0.000800 3.999949e-04 0.000200 3.996849e-04 0.001 10 {'C': 0.001, 'gamma': 10} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
5 0.000800 4.000666e-04 0.000200 3.994942e-04 0.001 100 {'C': 0.001, 'gamma': 100} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
6 0.000600 4.898625e-04 0.000000 0.000000e+00 0.01 0.001 {'C': 0.01, 'gamma': 0.001} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
7 0.000200 3.995895e-04 0.000200 4.005432e-04 0.01 0.01 {'C': 0.01, 'gamma': 0.01} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
8 0.001000 6.143617e-07 0.000000 0.000000e+00 0.01 0.1 {'C': 0.01, 'gamma': 0.1} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
9 0.000400 4.896872e-04 0.000200 4.000664e-04 0.01 1 {'C': 0.01, 'gamma': 1} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
10 0.000801 4.005718e-04 0.000200 3.999710e-04 0.01 10 {'C': 0.01, 'gamma': 10} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
11 0.000799 3.993803e-04 0.000200 4.002571e-04 0.01 100 {'C': 0.01, 'gamma': 100} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
12 0.000000 0.000000e+00 0.001000 3.989506e-07 0.1 0.001 {'C': 0.1, 'gamma': 0.001} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
13 0.001000 4.672031e-07 0.000000 0.000000e+00 0.1 0.01 {'C': 0.1, 'gamma': 0.01} 0.695652 0.695652 0.681818 0.681818 0.727273 0.696443 0.016610 17
14 0.001000 1.472802e-06 0.000000 0.000000e+00 0.1 0.1 {'C': 0.1, 'gamma': 0.1} 0.913043 0.913043 0.909091 0.863636 0.909091 0.901581 0.019054 16
15 0.001000 3.504023e-07 0.000000 0.000000e+00 0.1 1 {'C': 0.1, 'gamma': 1} 1.000000 0.913043 1.000000 0.909091 0.954545 0.955336 0.039794 6
16 0.000800 4.001626e-04 0.000000 0.000000e+00 0.1 10 {'C': 0.1, 'gamma': 10} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
17 0.000600 4.899014e-04 0.000600 4.899403e-04 0.1 100 {'C': 0.1, 'gamma': 100} 0.347826 0.347826 0.363636 0.363636 0.409091 0.366403 0.022485 22
18 0.001000 3.693565e-07 0.000000 0.000000e+00 1 0.001 {'C': 1, 'gamma': 0.001} 0.695652 0.695652 0.681818 0.681818 0.727273 0.696443 0.016610 17
19 0.001000 5.223489e-07 0.000000 0.000000e+00 1 0.01 {'C': 1, 'gamma': 0.01} 0.913043 0.913043 1.000000 0.909091 0.954545 0.937945 0.035211 11
20 0.000400 4.900379e-04 0.000200 3.999710e-04 1 0.1 {'C': 1, 'gamma': 0.1} 1.000000 0.956522 1.000000 0.909091 0.954545 0.964032 0.033918 3
21 0.000199 3.989220e-04 0.000200 3.997803e-04 1 1 {'C': 1, 'gamma': 1} 0.956522 0.913043 1.000000 0.909091 0.954545 0.946640 0.033305 8
22 0.000800 4.000669e-04 0.000200 3.997803e-04 1 10 {'C': 1, 'gamma': 10} 0.913043 0.956522 1.000000 0.818182 0.954545 0.928458 0.061620 13
23 0.000800 3.999951e-04 0.000400 4.896875e-04 1 100 {'C': 1, 'gamma': 100} 0.391304 0.434783 0.545455 0.500000 0.636364 0.501581 0.085693 21
24 0.000200 3.999710e-04 0.000200 4.000664e-04 10 0.001 {'C': 10, 'gamma': 0.001} 0.913043 0.913043 1.000000 0.909091 0.954545 0.937945 0.035211 11
25 0.000400 4.899208e-04 0.000200 4.000664e-04 10 0.01 {'C': 10, 'gamma': 0.01} 1.000000 0.956522 1.000000 0.909091 0.954545 0.964032 0.033918 3
26 0.000400 4.899208e-04 0.000200 3.997803e-04 10 0.1 {'C': 10, 'gamma': 0.1} 1.000000 0.956522 1.000000 0.954545 0.954545 0.973123 0.021957 1
27 0.000400 4.902131e-04 0.000200 3.998756e-04 10 1 {'C': 10, 'gamma': 1} 0.956522 0.956522 1.000000 0.863636 0.954545 0.946245 0.044708 9
28 0.000600 4.897067e-04 0.000208 4.151344e-04 10 10 {'C': 10, 'gamma': 10} 0.869565 0.913043 1.000000 0.818182 0.954545 0.911067 0.063488 14
29 0.000795 3.974699e-04 0.000400 4.898624e-04 10 100 {'C': 10, 'gamma': 100} 0.521739 0.521739 0.590909 0.590909 0.681818 0.581423 0.058964 19
30 0.000400 4.896874e-04 0.000200 3.997803e-04 100 0.001 {'C': 100, 'gamma': 0.001} 1.000000 0.956522 1.000000 0.909091 0.954545 0.964032 0.033918 3
31 0.000400 4.898040e-04 0.000000 0.000000e+00 100 0.01 {'C': 100, 'gamma': 0.01} 1.000000 0.913043 1.000000 0.954545 0.954545 0.964427 0.032761 2
32 0.000200 3.990173e-04 0.000200 4.001617e-04 100 0.1 {'C': 100, 'gamma': 0.1} 1.000000 0.956522 1.000000 0.863636 0.954545 0.954941 0.049799 7
33 0.000398 4.876012e-04 0.000305 3.986543e-04 100 1 {'C': 100, 'gamma': 1} 0.956522 0.956522 1.000000 0.863636 0.954545 0.946245 0.044708 9
34 0.000600 4.897458e-04 0.000200 3.999710e-04 100 10 {'C': 100, 'gamma': 10} 0.869565 0.913043 1.000000 0.818182 0.954545 0.911067 0.063488 14
35 0.000800 4.002100e-04 0.000200 4.000664e-04 100 100 {'C': 100, 'gamma': 100} 0.521739 0.521739 0.590909 0.590909 0.681818 0.581423 0.058964 19
scores=np.array(results.mean_test_score).reshape(6,6)
mglearn.tools.heatmap(scores,xlabel='gamma',xticklabels=param_grid['gamma'],ylabel='C',yticklabels=param_grid['C'],cmap='viridis')
<matplotlib.collections.PolyCollection at 0x2ea8db41f40>

png

'''bad example: deliberately poor search grids'''

fig,axes=plt.subplots(1,3,figsize=(13,5))
param_grid_linear={'C':np.linspace(1,2,6),'gamma':np.linspace(1,2,6)}
param_grid_one_log={'C':np.linspace(1,2,6),'gamma':np.logspace(-3,2,6)}
param_grid_range={'C':np.logspace(-3,2,6),'gamma':np.logspace(-7,-2,6)}
for param_grid,ax in zip([param_grid_linear,param_grid_one_log,param_grid_range],axes):
    grid_search=GridSearchCV(SVC(),param_grid,cv=5)
    grid_search.fit(X_train,y_train)
    scores=grid_search.cv_results_['mean_test_score'].reshape(6,6)
    scores_img=mglearn.tools.heatmap(scores,xlabel='gamma',ylabel='C',xticklabels=param_grid['gamma'],yticklabels=param_grid['C'],cmap='viridis',ax=ax)
plt.colorbar(scores_img,ax=axes.tolist())
<matplotlib.colorbar.Colorbar at 0x2ea86290670>

png

param_grid=[{'kernel':['rbf'],'C':[0.001,0.01,0.1,1,10,100],'gamma':[0.001,0.01,0.1,1,10,100]},
            {'kernel':['linear'],'C':[0.001,0.01,0.1,1,10,100],'gamma':[0.001,0.01,0.1,1,10,100]}]
print('List of grids:\n{}'.format(param_grid))
List of grids:
[{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}, {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}]
grid_search=GridSearchCV(SVC(),param_grid,cv=5)
grid_search.fit(X_train,y_train)
print('Best parameters:{}'.format(grid_search.best_params_))
print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))
Best parameters:{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best cross-validation score:0.97
results=pd.DataFrame(grid_search.cv_results_)
display(results.T)
0 1 2 3 4 5 6 7 8 9 ... 62 63 64 65 66 67 68 69 70 71
mean_fit_time 0.001 0.0002 0.000599 0.001 0.0004 0.0006 0.0002 0.001 0.001 0.0 ... 0.0004 0.0 0.0004 0.0002 0.000599 0.0002 0.000601 0.0002 0.0004 0.000202
std_fit_time 0.000001 0.0004 0.000489 0.000001 0.00049 0.00049 0.0004 0.0 0.0 0.0 ... 0.00049 0.0 0.00049 0.0004 0.000489 0.0004 0.00049 0.000401 0.00049 0.000403
mean_score_time 0.0002 0.0 0.000401 0.0 0.0 0.0004 0.0008 0.0 0.0 0.0006 ... 0.0004 0.0002 0.0002 0.0 0.0002 0.0002 0.000199 0.0002 0.0002 0.0004
std_score_time 0.0004 0.0 0.000491 0.0 0.0 0.00049 0.0004 0.0 0.0 0.00049 ... 0.00049 0.0004 0.0004 0.0 0.000399 0.0004 0.000399 0.0004 0.0004 0.00049
param_C 0.001 0.001 0.001 0.001 0.001 0.001 0.01 0.01 0.01 0.01 ... 10 10 10 10 100 100 100 100 100 100
param_gamma 0.001 0.01 0.1 1 10 100 0.001 0.01 0.1 1 ... 0.1 1 10 100 0.001 0.01 0.1 1 10 100
param_kernel rbf rbf rbf rbf rbf rbf rbf rbf rbf rbf ... linear linear linear linear linear linear linear linear linear linear
params {'C': 0.001, 'gamma': 0.001, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 0.01, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 0.1, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 1, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 10, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 100, 'kernel': 'rbf'} {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'} {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'} {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'} {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'} ... {'C': 10, 'gamma': 0.1, 'kernel': 'linear'} {'C': 10, 'gamma': 1, 'kernel': 'linear'} {'C': 10, 'gamma': 10, 'kernel': 'linear'} {'C': 10, 'gamma': 100, 'kernel': 'linear'} {'C': 100, 'gamma': 0.001, 'kernel': 'linear'} {'C': 100, 'gamma': 0.01, 'kernel': 'linear'} {'C': 100, 'gamma': 0.1, 'kernel': 'linear'} {'C': 100, 'gamma': 1, 'kernel': 'linear'} {'C': 100, 'gamma': 10, 'kernel': 'linear'} {'C': 100, 'gamma': 100, 'kernel': 'linear'}
split0_test_score 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 ... 1.0 1.0 1.0 1.0 0.956522 0.956522 0.956522 0.956522 0.956522 0.956522
split1_test_score 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 0.347826 ... 1.0 1.0 1.0 1.0 0.956522 0.956522 0.956522 0.956522 0.956522 0.956522
split2_test_score 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 ... 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
split3_test_score 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 0.363636 ... 0.909091 0.909091 0.909091 0.909091 0.909091 0.909091 0.909091 0.909091 0.909091 0.909091
split4_test_score 0.409091 0.409091 0.409091 0.409091 0.409091 0.409091 0.409091 0.409091 0.409091 0.409091 ... 0.954545 0.954545 0.954545 0.954545 0.954545 0.954545 0.954545 0.954545 0.954545 0.954545
mean_test_score 0.366403 0.366403 0.366403 0.366403 0.366403 0.366403 0.366403 0.366403 0.366403 0.366403 ... 0.972727 0.972727 0.972727 0.972727 0.955336 0.955336 0.955336 0.955336 0.955336 0.955336
std_test_score 0.022485 0.022485 0.022485 0.022485 0.022485 0.022485 0.022485 0.022485 0.022485 0.022485 ... 0.036364 0.036364 0.036364 0.036364 0.028764 0.028764 0.028764 0.028764 0.028764 0.028764
rank_test_score 52 52 52 52 52 52 52 52 52 52 ... 8 8 8 8 18 18 18 18 18 18

16 rows × 72 columns

scores=cross_val_score(GridSearchCV(SVC(),param_grid,cv=5),iris.data,iris.target)
print('Cross-validation scores',scores)
print('Mean cross-validation score:',scores.mean())
Cross-validation scores [0.96666667 1.         0.9        0.96666667 1.        ]
Mean cross-validation score: 0.9666666666666668
def nested_cv(X,y,inner_cv,outer_cv,Classifier,parameter_grid):
    outer_scores=[]
    # for each split of the data in the outer cross-validation
    for training_samples,test_samples in outer_cv.split(X,y):
        # find the best parameters using inner cross-validation
        best_params={}
        best_score=-np.inf
        for parameters in parameter_grid:
            # accumulate score over inner splits
            cv_scores=[]
            for inner_train,inner_test in inner_cv.split(X[training_samples],y[training_samples]):
                # build classifier given parameters and inner training data
                clf=Classifier(**parameters)
                clf.fit(X[inner_train],y[inner_train])
                # evaluate on the inner test set, not the data used for fitting
                cv_scores.append(clf.score(X[inner_test],y[inner_test]))
            # compare the mean score only after the inner loop finishes
            mean_score=np.mean(cv_scores)
            if mean_score>best_score:
                best_score=mean_score
                best_params=parameters
        # refit on the whole outer training set with the best parameters
        clf=Classifier(**best_params)
        clf.fit(X[training_samples],y[training_samples])
        # evaluate on the outer test set
        outer_scores.append(clf.score(X[test_samples],y[test_samples]))
    return np.array(outer_scores)
from sklearn.model_selection import ParameterGrid,StratifiedKFold
scores=nested_cv(iris.data,iris.target,StratifiedKFold(5),StratifiedKFold(5),SVC,ParameterGrid(param_grid))
print('Cross-validation scores:{}'.format(scores))
Cross-validation scores:[0.96666667 1.         0.9        0.96666667 1.        ]
from sklearn.datasets import load_digits
digits=load_digits()
y=digits.target==9
X_train,X_test,y_train,y_test=train_test_split(digits.data,y,random_state=0)
from sklearn.dummy import DummyClassifier
dummy_majority=DummyClassifier(strategy='most_frequent').fit(X_train,y_train)
pred_most_frequent=dummy_majority.predict(X_test)
print('Unique predicted labels:{}'.format(np.unique(pred_most_frequent)))
print('Test score:{:.2f}'.format(dummy_majority.score(X_test,y_test)))
Unique predicted labels:[False]
Test score:0.90
from sklearn.tree import DecisionTreeClassifier
tree=DecisionTreeClassifier(max_depth=2).fit(X_train,y_train)
pred_tree=tree.predict(X_test)
print('Test score:{:.2f}'.format(tree.score(X_test,y_test)))
Test score:0.92
from sklearn.linear_model import LogisticRegression
dummy=DummyClassifier().fit(X_train,y_train)
pred_dummy=dummy.predict(X_test)
print('Dummy score:{:.2f}'.format(dummy.score(X_test,y_test)))
logreg=LogisticRegression(C=0.1).fit(X_train,y_train)
pred_logreg=logreg.predict(X_test)
print('logreg score:{:.2f}'.format(logreg.score(X_test,y_test)))
Dummy score:0.90
logreg score:0.98


from sklearn.metrics import confusion_matrix
confusion=confusion_matrix(y_test,pred_logreg)
print('Confusion matrix:\n{}'.format(confusion))
Confusion matrix:
[[402   1]
 [  6  41]]
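# Sketch: the four entries of the binary confusion matrix give the standard
# metrics directly; sklearn orders them as [[TN, FP], [FN, TP]].
tn,fp,fn,tp=confusion.ravel()
print('precision:{:.2f}'.format(tp/(tp+fp)))        # 41/42
print('recall:{:.2f}'.format(tp/(tp+fn)))           # 41/47
print('accuracy:{:.2f}'.format((tp+tn)/(tn+fp+fn+tp)))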
mglearn.plots.plot_confusion_matrix_illustration()

png

mglearn.plots.plot_binary_confusion_matrix()

png

print('\nMost frequent class:')
print(confusion_matrix(y_test,pred_most_frequent))
print('\nDummy model:')
print(confusion_matrix(y_test,pred_dummy))
print('\nDecision tree:')
print(confusion_matrix(y_test,pred_tree))
print('\nLogistic regression:')
print(confusion_matrix(y_test,pred_logreg))
Most frequent class:
[[403   0]
 [ 47   0]]

Dummy model:
[[403   0]
 [ 47   0]]

Decision tree:
[[390  13]
 [ 24  23]]

Logistic regression:
[[402   1]
 [  6  41]]
from sklearn.metrics import f1_score
print('f1 score most frequent:{:.2f}'.format(f1_score(y_test,pred_most_frequent)))
print('f1 score dummy:{:.2f}'.format(f1_score(y_test,pred_dummy)))
print('f1 score tree:{:.2f}'.format(f1_score(y_test,pred_tree)))
print('f1 score logreg:{:.2f}'.format(f1_score(y_test,pred_logreg)))

f1 score most frequent:0.00
f1 score dummy:0.00
f1 score tree:0.55
f1 score logreg:0.92
from sklearn.metrics import classification_report
print(classification_report(y_test,pred_most_frequent,target_names=['not nine','nine']))
              precision    recall  f1-score   support

    not nine       0.90      1.00      0.94       403
        nine       0.00      0.00      0.00        47

    accuracy                           0.90       450
   macro avg       0.45      0.50      0.47       450
weighted avg       0.80      0.90      0.85       450



C:\Users\reion\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1248: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
print(classification_report(y_test,pred_dummy,target_names=['not nine','nine']))
              precision    recall  f1-score   support

    not nine       0.90      1.00      0.94       403
        nine       0.00      0.00      0.00        47

    accuracy                           0.90       450
   macro avg       0.45      0.50      0.47       450
weighted avg       0.80      0.90      0.85       450



print(classification_report(y_test,pred_logreg,target_names=['not nine','nine']))
              precision    recall  f1-score   support

    not nine       0.99      1.00      0.99       403
        nine       0.98      0.87      0.92        47

    accuracy                           0.98       450
   macro avg       0.98      0.93      0.96       450
weighted avg       0.98      0.98      0.98       450
from sklearn.datasets import make_blobs
from sklearn.svm import SVC
X,y=make_blobs(n_samples=(400,50),centers=2,cluster_std=[7.0,2],random_state=42)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
svc=SVC(gamma=.05).fit(X_train,y_train)
print(X.shape)
(450, 2)


mglearn.plots.plot_decision_threshold()

png

from sklearn.metrics import classification_report
print(classification_report(y_test,svc.predict(X_test)))
              precision    recall  f1-score   support

           0       0.99      0.93      0.96       107
           1       0.42      0.83      0.56         6

    accuracy                           0.93       113
   macro avg       0.70      0.88      0.76       113
weighted avg       0.96      0.93      0.94       113
y_pred_low_threshold=svc.decision_function(X_test)>-.8
print(classification_report(y_test,y_pred_low_threshold))
              precision    recall  f1-score   support

           0       1.00      0.89      0.94       107
           1       0.33      1.00      0.50         6

    accuracy                           0.89       113
   macro avg       0.67      0.94      0.72       113
weighted avg       0.96      0.89      0.92       113
from sklearn.metrics import precision_recall_curve,f1_score
precision,recall,thresholds=precision_recall_curve(y_test,svc.decision_function(X_test))
X,y=make_blobs(n_samples=(4000,500),centers=2,cluster_std=[7.0,2],random_state=22)
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0)
svc=SVC(gamma=.05).fit(X_train,y_train)
precision,recall,thresholds=precision_recall_curve(y_test,svc.decision_function(X_test))
close_zero=np.argmin(np.abs(thresholds))
plt.plot(precision[close_zero],recall[close_zero],'o',markersize=10,label='threshold zero',fillstyle='none',c='k',mew=2)
plt.plot(precision,recall,label='precision recall curve')
plt.xlabel('Precision')
plt.ylabel('Recall')
Text(0, 0.5, 'Recall')

png

from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=100,random_state=0,max_features=2)
rf.fit(X_train,y_train)
precision_rf,recall_rf,thresholds_rf=precision_recall_curve(y_test,rf.predict_proba(X_test)[:,1])
plt.plot(precision,recall,label='svc')
plt.plot(precision[close_zero],recall[close_zero],'o',markersize=10,label='threshold zero svc',fillstyle='none',c='k',mew=2)
plt.plot(precision_rf,recall_rf,label='rf')
close_default_rf=np.argmin(np.abs(thresholds_rf-0.5))
plt.plot(precision_rf[close_default_rf],recall_rf[close_default_rf],'^',c='k',markersize=10,label='threshold 0.5 rf',fillstyle='none',mew=2)
plt.xlabel('Precision')
plt.ylabel('Recall')
plt.legend(loc='best')
<matplotlib.legend.Legend at 0x2ea95b7f940>

png

print('f1 score of random forest:{:.3f}'.format(f1_score(y_test,rf.predict(X_test))))
print('f1 score of svc:{:.3f}'.format(f1_score(y_test,svc.predict(X_test))))
f1 score of random forest:0.610
f1 score of svc:0.656
from sklearn.metrics import average_precision_score
ap_rf=average_precision_score(y_test,rf.predict_proba(X_test)[:,1])
ap_svc=average_precision_score(y_test,svc.decision_function(X_test))
print('Average precision of random forest:{:.3f}'.format(ap_rf))
print('Average precision of svc:{:.3f}'.format(ap_svc))
Average precision of random forest:0.660
Average precision of svc:0.666
from sklearn.metrics import roc_curve
fpr,tpr,thresholds=roc_curve(y_test,svc.decision_function(X_test))
plt.plot(fpr,tpr,label='ROC Curve')
plt.xlabel('FPR')
plt.ylabel('TPR(recall)')
close_zero=np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero],tpr[close_zero],'o',markersize=10,label='threshold zero',fillstyle='none',c='k',mew=2)
plt.legend()
<matplotlib.legend.Legend at 0x2ea838e29d0>

png

from sklearn.metrics import roc_curve
fpr_rf,tpr_rf,thresholds_rf=roc_curve(y_test,rf.predict_proba(X_test)[:,1])
plt.plot(fpr,tpr,label='ROC Curve SVC')
plt.plot(fpr_rf,tpr_rf,label='ROC Curve RF')
plt.xlabel('FPR')
plt.ylabel('TPR (recall)')
plt.plot(fpr[close_zero],tpr[close_zero],'o',markersize=10,label='threshold zero',fillstyle='none',c='k',mew=2)
close_default_rf=np.argmin(np.abs(thresholds_rf-0.5))
plt.plot(fpr_rf[close_default_rf],tpr_rf[close_default_rf],'^',markersize=10,label='threshold 0.5 RF',fillstyle='none',c='k',mew=2)

[<matplotlib.lines.Line2D at 0x2ea91679ca0>]

png

from sklearn.metrics import roc_auc_score
rf_auc=roc_auc_score(y_test,rf.predict_proba(X_test)[:,1])
svc_auc=roc_auc_score(y_test,svc.decision_function(X_test))
print('AUC for Random Forest:{:.3f}'.format(rf_auc))
print('AUC for SVC:{:.3f}'.format(svc_auc))
AUC for Random Forest:0.937
AUC for SVC:0.916
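# Sketch (an assumption about threshold selection, not from the book): a
# common way to pick an operating point on the ROC curve is to maximize
# tpr - fpr (Youden's J statistic) over the candidate thresholds.
best_idx=np.argmax(tpr-fpr)
print('Threshold maximizing tpr-fpr:{:.3f} (fpr={:.3f}, tpr={:.3f})'.format(thresholds[best_idx],fpr[best_idx],tpr[best_idx]))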
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,roc_curve
digits=load_digits()
y=digits.target==9
X_train,X_test,y_train,y_test=train_test_split(digits.data,y,random_state=0)
plt.figure()
for gamma in [1,0.05,0.01]:
    svc=SVC(gamma=gamma).fit(X_train,y_train)
    accuracy=svc.score(X_test,y_test)
    auc=roc_auc_score(y_test,svc.decision_function(X_test))
    fpr,tpr,_=roc_curve(y_test,svc.decision_function(X_test))
    print('gamma={:.2f} accuracy={:.2f} AUC={:.2f}'.format(gamma,accuracy,auc))
    plt.plot(fpr,tpr,label='gamma={:.3f}'.format(gamma))
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.xlim(-0.01,1)
plt.ylim(0,1.02)
plt.legend(loc='best')
gamma=1.00 accuracy=0.90 AUC=0.50
gamma=0.05 accuracy=0.90 AUC=1.00
gamma=0.01 accuracy=0.90 AUC=1.00
<matplotlib.legend.Legend at 0x2ea916a4820>

png

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
X_train,X_test,y_train,y_test=train_test_split(digits.data,digits.target,random_state=0)
lr=LogisticRegression().fit(X_train,y_train)
pred=lr.predict(X_test)
print('Accuracy:{:.3f}'.format(accuracy_score(y_test,pred)))
print('Confusion matrix:\n{}'.format(confusion_matrix(y_test,pred)))
Accuracy:0.951
Confusion matrix:
[[37  0  0  0  0  0  0  0  0  0]
 [ 0 40  0  0  0  0  0  0  2  1]
 [ 0  1 40  3  0  0  0  0  0  0]
 [ 0  0  0 43  0  0  0  0  1  1]
 [ 0  0  0  0 37  0  0  1  0  0]
 [ 0  0  0  0  0 46  0  0  0  2]
 [ 0  1  0  0  0  0 51  0  0  0]
 [ 0  0  0  1  1  0  0 46  0  0]
 [ 0  3  1  0  0  0  0  0 43  1]
 [ 0  0  0  0  0  1  0  0  1 45]]


score_image=mglearn.tools.heatmap(
confusion_matrix(y_test,pred),xlabel='Predicted label',ylabel='True label',xticklabels=digits.target_names,
yticklabels=digits.target_names,cmap=plt.cm.gray_r,fmt='%d')
plt.title('Confusion matrix')
plt.gca().invert_yaxis()

png

print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        37
           1       0.89      0.93      0.91        43
           2       0.98      0.91      0.94        44
           3       0.91      0.96      0.93        45
           4       0.97      0.97      0.97        38
           5       0.98      0.96      0.97        48
           6       1.00      0.98      0.99        52
           7       0.98      0.96      0.97        48
           8       0.91      0.90      0.91        48
           9       0.90      0.96      0.93        47

    accuracy                           0.95       450
   macro avg       0.95      0.95      0.95       450
weighted avg       0.95      0.95      0.95       450
print('Micro average f1 score:{:.3f}'.format(f1_score(y_test,pred,average='micro')))
print('Macro average f1 score:{:.3f}'.format(f1_score(y_test,pred,average='macro')))
Micro average f1 score:0.951
Macro average f1 score:0.952
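# For completeness (added sketch): average='weighted' weights each per-class
# f1 by its support; the classification report above already lists it as 0.95.
print('Weighted average f1 score:{:.3f}'.format(f1_score(y_test,pred,average='weighted')))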
from sklearn.model_selection import cross_val_score,GridSearchCV
print('Default scoring:{}'.format(cross_val_score(SVC(),digits.data,digits.target==9)))
explicit_accuracy=cross_val_score(SVC(),digits.data,digits.target==9,scoring='accuracy')
print('Explicit accuracy scoring:{}'.format(explicit_accuracy))
roc_auc=cross_val_score(SVC(),digits.data,digits.target==9,scoring='roc_auc')
print('AUC scoring:{}'.format(roc_auc))
Default scoring:[0.975      0.99166667 1.         0.99442897 0.98050139]
Explicit accuracy scoring:[0.975      0.99166667 1.         0.99442897 0.98050139]
AUC scoring:[0.99717078 0.99854252 1.         0.999828   0.98400413]
X_train,X_test,y_train,y_test=train_test_split(digits.data,digits.target==9,random_state=0)
param_grid={'gamma':[0.0001,0.001,0.1,1,10]}
grid=GridSearchCV(SVC(),param_grid=param_grid)
grid.fit(X_train,y_train)
print('Grid-Search with accuracy')
print('Best parameters:',grid.best_params_)
print('Best cross-validation score (accuracy):{:.3f}'.format(grid.best_score_))
print('Test set AUC:{:.3f}'.format(roc_auc_score(y_test,grid.decision_function(X_test))))
print('Test set accuracy:{:.3f}'.format(grid.score(X_test,y_test)))
Grid-Search with accuracy
Best parameters: {'gamma': 0.001}
Best cross-validation score (accuracy):0.996
Test set AUC:1.000
Test set accuracy:0.991
grid=GridSearchCV(SVC(),param_grid=param_grid,scoring='roc_auc')
grid.fit(X_train,y_train)
print('\nGrid-Search with AUC')
print('Best parameters:',grid.best_params_)
print('Best cross-validation score (AUC):{:.3f}'.format(grid.best_score_))
print('Test set AUC:{:.3f}'.format(roc_auc_score(y_test,grid.decision_function(X_test))))
print('Test set accuracy:{:.3f}'.format(grid.score(X_test,y_test)))
Grid-Search with AUC
Best parameters: {'gamma': 0.001}
Best cross-validation score (AUC):0.999
Test set AUC:1.000
Test set accuracy:1.000
# from sklearn.metrics import SCORERS
# print('Available scorers:\n{}'.format(sorted(SCORERS.keys())))
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
cancer=load_breast_cancer()
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,random_state=0)
scaler=MinMaxScaler().fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)
svm=SVC()
svm.fit(X_train_scaled,y_train)
print('Test score:{:.3f}'.format(svm.score(X_test_scaled,y_test)))
Test score:0.972
from sklearn.model_selection import GridSearchCV
param_grid={'C':[0.001,0.01,0.1,1,10,100],
'gamma':[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(SVC(),param_grid=param_grid,cv=5)
grid.fit(X_train_scaled,y_train)
print('Best cross-validation score (accuracy):{:.3f}'.format(grid.best_score_))
print('Test set accuracy:{:.3f}'.format(grid.score(X_test_scaled,y_test)))
print('Best parameters:',grid.best_params_)
Best cross-validation score (accuracy):0.981
Test set accuracy:0.972
Best parameters: {'C': 1, 'gamma': 1}
mglearn.plots.plot_improper_processing()

png

from sklearn.pipeline import Pipeline
pipe=Pipeline([('scaler',MinMaxScaler()),('svm',SVC())])
pipe.fit(X_train,y_train)
Pipeline(steps=[('scaler', MinMaxScaler()), ('svm', SVC())])
print('Test score:{:.2f}'.format(pipe.score(X_test,y_test)))
Test score:0.97
param_grid={'svm__C':[0.001,0.01,0.1,1,10,100],'svm__gamma':[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(pipe,param_grid=param_grid,cv=5)
grid.fit(X_train,y_train)
print('Best cross-validation accuracy:{:.2f}'.format(grid.best_score_))
print('Test set score:{:.2f}'.format(grid.score(X_test,y_test)))
print('Best parameters:{}'.format(grid.best_params_))
Best cross-validation accuracy:0.98
Test set score:0.97
Best parameters:{'svm__C': 1, 'svm__gamma': 1}
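# Sketch: the 'step__parameter' names used in param_grid can be listed from
# the pipeline itself, which helps avoid typos in the nested names.
for name in pipe.get_params().keys():
    if name.startswith('svm__'):
        print(name)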
mglearn.plots.plot_proper_processing()

png

def fit(self,X,y):
    X_transformed=X
    # fit and transform with every step except the last
    for name,estimator in self.steps[:-1]:
        X_transformed=estimator.fit_transform(X_transformed,y)
    # fit the last step (the estimator) on the transformed data
    self.steps[-1][1].fit(X_transformed,y)
    return self
def predict(self,X):
    X_transformed=X
    # transform with every step except the last
    for step in self.steps[:-1]:
        X_transformed=step[1].transform(X_transformed)
    # predict with the last step
    return self.steps[-1][1].predict(X_transformed)
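# Quick check (added sketch, not from the original run): the fitted pipeline
# from above behaves exactly like scaling manually and then predicting with
# an SVC fit on the scaled training data.
scaler_check=MinMaxScaler().fit(X_train)
svc_check=SVC().fit(scaler_check.transform(X_train),y_train)
print(np.all(pipe.predict(X_test)==svc_check.predict(scaler_check.transform(X_test))))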
from sklearn.pipeline import make_pipeline
pipe_log=Pipeline([('scaler',MinMaxScaler()),('svm',SVC(C=100))])
pipe_short=make_pipeline(MinMaxScaler(),SVC(C=100))
print('Pipeline steps:\n{}'.format(pipe_short.steps))
Pipeline steps:
[('minmaxscaler', MinMaxScaler()), ('svc', SVC(C=100))]
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
pipe=make_pipeline(StandardScaler(),PCA(n_components=2),StandardScaler())
print('Pipeline steps:\n{}'.format(pipe.steps))
Pipeline steps:
[('standardscaler-1', StandardScaler()), ('pca', PCA(n_components=2)), ('standardscaler-2', StandardScaler())]
pipe.fit(cancer.data)
components=pipe.named_steps['pca'].components_
print('components.shape{}'.format(components.shape))
components.shape(2, 30)
from sklearn.linear_model import LogisticRegression
pipe=make_pipeline(StandardScaler(),LogisticRegression())
param_grid={'logisticregression__C':[0.01,0.1,1,10,100]}
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,random_state=4)
grid=GridSearchCV(estimator=pipe,param_grid=param_grid,cv=5)
grid.fit(X_train,y_train)
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression())]),
             param_grid={'logisticregression__C': [0.01, 0.1, 1, 10, 100]})
print('Best estimator:\n{}'.format(grid.best_estimator_))
Best estimator:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=1))])
print('Logistic regression step:\n{}'.format(grid.best_estimator_.named_steps['logisticregression']))
Logistic regression step:
LogisticRegression(C=1)
print('Logistic regression coefficients:\n{}'.format(grid.best_estimator_.named_steps['logisticregression'].coef_))
Logistic regression coefficients:
[[-0.43570655 -0.34266946 -0.40809443 -0.5344574  -0.14971847  0.61034122
  -0.72634347 -0.78538827  0.03886087  0.27497198 -1.29780109  0.04926005
  -0.67336941 -0.93447426 -0.13939555  0.45032641 -0.13009864 -0.10144273
   0.43432027  0.71596578 -1.09068862 -1.09463976 -0.85183755 -1.06406198
  -0.74316099  0.07252425 -0.82323903 -0.65321239 -0.64379499 -0.42026013]]
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
boston=load_boston()
X_train,X_test,y_train,y_test=train_test_split(boston.data,boston.target,random_state=0)
from sklearn.preprocessing import PolynomialFeatures
pipe=make_pipeline(StandardScaler(),PolynomialFeatures(),Ridge())
param_grid={'polynomialfeatures__degree':[1,2,3],'ridge__alpha':[0.001,0.01,0.1,1,10,100]}
grid=GridSearchCV(estimator=pipe,param_grid=param_grid,cv=5,n_jobs=-1)
grid.fit(X_train,y_train)
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('polynomialfeatures',
                                        PolynomialFeatures()),
                                       ('ridge', Ridge())]),
             n_jobs=-1,
             param_grid={'polynomialfeatures__degree': [1, 2, 3],
                         'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]})
plt.matshow(grid.cv_results_['mean_test_score'].reshape(3,-1),vmin=0,cmap='viridis')
plt.xlabel('ridge__alpha')
plt.ylabel('polynomialfeatures__degree')
plt.xticks(range(len(param_grid['ridge__alpha'])),param_grid['ridge__alpha'])
plt.yticks(range(len(param_grid['polynomialfeatures__degree'])),param_grid['polynomialfeatures__degree'])
plt.colorbar()
<matplotlib.colorbar.Colorbar at 0x2ea92bbd7f0>

png

print('Best parameters:{}'.format(grid.best_params_))
Best parameters:{'polynomialfeatures__degree': 2, 'ridge__alpha': 10}
print('Test-set score:{:.2f}'.format(grid.score(X_test,y_test)))
Test-set score:0.77
param_grid={'ridge__alpha':[0.001,0.01,0.1,1,10,100]}
pipe=make_pipeline(StandardScaler(),Ridge())
grid=GridSearchCV(pipe,param_grid,cv=5)
grid.fit(X_train,y_train)
print('Score without poly features:{:.2f}'.format(grid.score(X_test,y_test)))
Score without poly features:0.63
pipe=Pipeline([('preprocessing',StandardScaler()),('classifier',SVC())])
from sklearn.ensemble import RandomForestClassifier
param_grid=[{'classifier':[SVC()],'preprocessing':[StandardScaler(),None],
            'classifier__gamma':[0.001,0.01,0.1,1,10,100],
            'classifier__C':[0.001,0.01,0.1,1,10,100]},
           {'classifier':[RandomForestClassifier(n_estimators=100)],
           'preprocessing':[None],'classifier__max_features':[1,2,3]}]
X_train,X_test,y_train,y_test=train_test_split(cancer.data,cancer.target,random_state=0)
grid=GridSearchCV(pipe,param_grid,cv=5)
grid.fit(X_train,y_train)
print('Best params:\n{}\n'.format(grid.best_params_))
print('Best cross_validation score:{:.2f}'.format(grid.best_score_))
print('Test set score:{:.2f}'.format(grid.score(X_test,y_test)))
Best params:
{'classifier': SVC(C=10, gamma=0.01), 'classifier__C': 10, 'classifier__gamma': 0.01, 'preprocessing': StandardScaler()}

Best cross_validation score:0.99
Test set score:0.98
%%time
from sklearn.datasets import load_files
reviews_train=load_files(r'./aclImdb/train/')
text_train,y_train=reviews_train.data[:25000],reviews_train.target[reviews_train.target!=2]
print('type of text_train:{}'.format(type(text_train)))
print('length of text_train:{}'.format(len(text_train)))
print('text_train[1]:\n{}'.format(text_train[1]))
type of text_train:<class 'list'>
length of text_train:25000
text_train[1]:
b"Amount of disappointment I am getting these days seeing movies like Partner, Jhoom Barabar and now, Heyy Babyy is gonna end my habit of seeing first day shows.<br /><br />The movie is an utter disappointment because it had the potential to become a laugh riot only if the d\xc3\xa9butant director, Sajid Khan hadn't tried too many things. Only saving grace in the movie were the last thirty minutes, which were seriously funny elsewhere the movie fails miserably. First half was desperately been tried to look funny but wasn't. Next 45 minutes were emotional and looked totally artificial and illogical.<br /><br />OK, when you are out for a movie like this you don't expect much logic but all the flaws tend to appear when you don't enjoy the movie and thats the case with Heyy Babyy. Acting is good but thats not enough to keep one interested.<br /><br />For the positives, you can take hot actresses, last 30 minutes, some comic scenes, good acting by the lead cast and the baby. Only problem is that these things do not come together properly to make a good movie.<br /><br />Anyways, I read somewhere that It isn't a copy of Three men and a baby but I think it would have been better if it was."
Wall time: 19.4 s
text_train=[doc.replace(b"<br />",b" ")for doc in text_train]
print('Samples per class (training):{}'.format(np.bincount(y_train)))
Samples per class (training):[12500 12500]
reviews_test=load_files(r'./aclImdb/test/')
text_test,y_test=reviews_test.data,reviews_test.target
print('Number of documents in test data:{}'.format(len(text_test)))
print('Samples per class (test):{}'.format(np.bincount(y_test)))
text_test=[doc.replace(b"<br />",b" ")for doc in text_test]
Number of documents in test data:25000
Samples per class (test):[12500 12500]
bards_words=['The fool doth think he is wise','but the wise man knows himself to be a fool']
from sklearn.feature_extraction.text import CountVectorizer
vect=CountVectorizer()
vect.fit(bards_words)
CountVectorizer()
print('Vocabulary size:{}'.format(len(vect.vocabulary_)))
print('Vocabulary content:\n{}'.format(vect.vocabulary_))
Vocabulary size:13
Vocabulary content:
{'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}
bag_of_words=vect.transform(bards_words)
print('bag_of_words:{}'.format(repr(bag_of_words)))
print(bag_of_words)
bag_of_words:<2x13 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 6)	1
  (0, 9)	1
  (0, 10)	1
  (0, 12)	1
  (1, 0)	1
  (1, 1)	1
  (1, 3)	1
  (1, 5)	1
  (1, 7)	1
  (1, 8)	1
  (1, 9)	1
  (1, 11)	1
  (1, 12)	1
print('Dense representation of bag_of_words:\n{}'.format(bag_of_words.toarray()))
Dense representation of bag_of_words:
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]
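# Sketch: each column of the dense array corresponds to one vocabulary entry;
# sorting the vocabulary by index recovers the column order shown above
# (column 0 is 'be', column 1 is 'but', column 2 is 'doth', ...).
column_words=sorted(vect.vocabulary_,key=vect.vocabulary_.get)
print('Column order:\n{}'.format(column_words))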
vect=CountVectorizer().fit(text_test)
X_test=vect.transform(text_test)
print('X_test:\n{}'.format(repr(X_test)))
feature_names=vect.get_feature_names()
print('Number of features :{}'.format(len(feature_names)))
print('First 20 features:\n{}'.format(feature_names[:20]))
print('Features 20010 to 20030:\n{}'.format(feature_names[20010:20030]))
print('Every 2000th feature:\n{}'.format(feature_names[::2000]))
Number of features :73822
First 20 features:
['00', '000', '00000000000', '00000001', '000dm', '001', '0069', '007', '0079', '007s', '0083', '009', '00am', '00o', '00pm', '00s', '00schneider', '01', '0126', '0148']
Features 20010 to 20030:
['drought', 'drove', 'drover', 'droves', 'drowing', 'drown', 'drowned', 'drowning', 'drownings', 'drowns', 'drowsy', 'dru', 'drubbed', 'drubbing', 'drudge', 'drudgery', 'drudges', 'drudging', 'druedain', 'drug']
Every 2000th feature:
['00', 'afflicted', 'ardal', 'basket', 'boiled', 'calculating', 'chitty', 'congealed', 'cushioned', 'devoted', 'droplet', 'envogue', 'felled', 'frontline', 'gorman', 'header', 'hypnotism', 'intruments', 'kerchiefs', 'leos', 'malerie', 'microwaves', 'mutually', 'oedipus', 'pasts', 'polaroids', 'pushtun', 'remnar', 'runner', 'sentencing', 'slips', 'starkly', 'swiches', 'tings', 'ulloa', 'venice', 'widen']
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
scores=cross_val_score(LogisticRegression(),X_test,y_test,cv=5)
print('Mean cross-validation accuracy:{:.2f}'.format(np.mean(scores)))
Mean cross-validation accuracy:0.89


%%time
from sklearn.model_selection import GridSearchCV
param_grid={'C':[0.001,0.01,0.1,1,10]}
grid=GridSearchCV(LogisticRegression(),param_grid,cv=5,n_jobs=-1)
grid.fit(X_test,y_test)
print('Best cross-validation score:{:.2f}'.format(grid.best_score_))
print('Best parameters:',grid.best_params_)
Best cross-validation score:0.89
Best parameters: {'C': 0.1}
Wall time: 22.9 s


vect=CountVectorizer(min_df=5).fit(text_test)
X_test=vect.transform(text_test)
print('X_test with min_df:{}'.format(repr(X_test)))
X_test with min_df:<25000x26749 sparse matrix of type '<class 'numpy.int64'>'
	with 3289577 stored elements in Compressed Sparse Row format>
feature_names=vect.get_feature_names()
print('First 50 features:\n{}'.format(feature_names[:50]))
print('Feature 20010 to 20030:\n{}'.format(feature_names[20010:20030]))
print('Every 700th feature:\n{}'.format(feature_names[::700]))
First 50 features:
['00', '000', '007', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '100', '1000', '100th', '101', '102', '105', '107', '108', '109', '10th', '11', '110', '111', '116', '117', '11th', '12', '120', '125', '12th', '13', '130', '13th', '14', '140', '14th', '15', '150', '1500', '15th', '16', '16mm', '16s', '16th', '17', '170', '1700']
Feature 20010 to 20030:
['riddler', 'riddles', 'ride', 'rider', 'riders', 'rides', 'ridge', 'ridgemont', 'ridges', 'ridicule', 'ridiculed', 'ridicules', 'ridiculous', 'ridiculously', 'ridiculousness', 'riding', 'ridley', 'riefenstahl', 'rife', 'riff']
Every 700th feature:
['00', 'affiliates', 'arbitrary', 'baritone', 'boats', 'caddy', 'childless', 'completist', 'crazily', 'delinquents', 'distinguishable', 'egged', 'exceptionally', 'fink', 'gain', 'grimace', 'hesitate', 'immune', 'invasions', 'king', 'lips', 'marthy', 'mistaking', 'netherworld', 'othello', 'permeate', 'pratfall', 'quip', 'renant', 'rotund', 'selfless', 'skilled', 'spoon', 'succinctly', 'terminator', 'treasures', 'uns', 'warhol', 'yearning']
grid=GridSearchCV(LogisticRegression(),param_grid,cv=5)
grid.fit(X_test,y_test)
print('Best cross-validation score:{:.2f}'.format(grid.best_score_))
C:\Users\reion\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
C:\Users\reion\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best cross_validation score:0.89
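
The lbfgs ConvergenceWarning above, which the notebook emits once per cross-validation fit, just means the solver hit its default iteration cap on the unscaled bag-of-words features. A minimal sketch of the fix the message itself suggests (not in the original notebook; it assumes the param_grid over C defined earlier):

# give lbfgs more iterations than its default of 100 so each CV fit converges
from sklearn.linear_model import LogisticRegression
grid=GridSearchCV(LogisticRegression(max_iter=1000),param_grid,cv=5,n_jobs=-1)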
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print('Number of stop words:{}'.format(len(ENGLISH_STOP_WORDS)))
print('Every 10th stopword:\n{}'.format(list(ENGLISH_STOP_WORDS)[::10]))
Number of stop words:318
Every 10th stopword:
['bottom', 'bill', 'indeed', 'except', 'therein', 'made', 'during', 'am', 'whoever', 'via', 'across', 'beforehand', 'after', 'on', 'otherwise', 'be', 'three', 'she', 'without', 'never', 'him', 'most', 'the', 'elsewhere', 'must', 'wherein', 'describe', 'nobody', 'two', 'whole', 'ltd', 'seems']
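
stop_words also accepts an explicit list, so the built-in set can be extended with corpus-specific terms; a small sketch, where the added words are only hypothetical examples:

# extend the built-in English stop words with (hypothetical) domain words
custom_stops=list(ENGLISH_STOP_WORDS)+['movie','film']
vect=CountVectorizer(min_df=5,stop_words=custom_stops)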
vect=CountVectorizer(min_df=5,stop_words='english').fit(text_test)
X_test=vect.transform(text_test)
print('X_test with stop words:\n{}'.format(repr(X_test)))
X_test with stop words:
<25000x26445 sparse matrix of type '<class 'numpy.int64'>'
	with 2101629 stored elements in Compressed Sparse Row format>
grid=GridSearchCV(LogisticRegression(),param_grid,cv=5,n_jobs=-1)
grid.fit(X_test,y_test)
print('Best cross_validation score:{:.2f}'.format(grid.best_score_))
Best cross_validation score:0.89


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
pipe=make_pipeline(TfidfVectorizer(min_df=5),LogisticRegression())
param_grid={'logisticregression__C':[0.001,0.01,0.1,1,10]}
grid=GridSearchCV(pipe,param_grid,cv=5,n_jobs=-1)
grid.fit(text_test,y_test)
print('Best cross_validation score:{:.2f}'.format(grid.best_score_))
Best cross_validation score:0.90


vectorizer=grid.best_estimator_.named_steps['tfidfvectorizer']
X_test=vectorizer.transform(text_train)
max_value=X_test.max(axis=0).toarray().ravel()
sorted_by_tfidf=max_value.argsort()
feature_names=np.array(vectorizer.get_feature_names())
print('Features with lowest tfidf:\n{}'.format(feature_names[sorted_by_tfidf[:20]]))
print('Features with highest tfidf:\n{}'.format(feature_names[sorted_by_tfidf[-20:]]))
Features with lowest tfidf:
['exorbitant' 'dillane' 'strangeland' 'dicamillo' 'strother' 'desecrate'
 'stuey' 'dengler' 'demeter' 'stéphanois' 'subjugation' 'delair' 'dekalog'
 'deepa' 'davidians' 'daulton' 'darwyn' 'darvi' 'darkhunters'
 'summerville']
Features with highest tfidf:
['autism' 'botched' 'mrs' 'didn' 'godzilla' 'dev' 'lupin' 'rocky'
 'amityville' 'weller' 'bye' 'bambi' 'frasier' 'pokemon' 'othello' 'yadda'
 'lucy' 'ha' 'click' 'wicked']
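
The per-feature maximum favors words that dominate a single review; averaging over all documents is another quick summary, sketched here with the arrays already defined (not from the book):

# mean tf-idf per feature as an alternative ranking to the per-feature max
mean_value=np.asarray(X_test.mean(axis=0)).ravel()
sorted_by_mean=mean_value.argsort()
print('Features with highest mean tfidf:\n{}'.format(feature_names[sorted_by_mean[-20:]]))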
sorted_by_idf=np.argsort(vectorizer.idf_)
print('Features with lowest idf:\n{}'.format(feature_names[sorted_by_idf[:100]]))
Features with lowest idf:
['the' 'and' 'of' 'to' 'this' 'is' 'it' 'in' 'that' 'but' 'for' 'with'
 'was' 'as' 'on' 'movie' 'not' 'one' 'have' 'be' 'film' 'you' 'are' 'all'
 'at' 'an' 'by' 'so' 'from' 'like' 'who' 'if' 'they' 'there' 'out' 'his'
 'just' 'or' 'about' 'he' 'what' 'has' 'some' 'can' 'good' 'when' 'more'
 'up' 'time' 'very' 'even' 'see' 'only' 'my' 'would' 'no' 'well' 'really'
 'which' 'me' 'story' 'had' 'much' 'their' 'than' 'other' 'were' 'get'
 'do' 'been' 'don' 'most' 'also' 'how' 'great' 'into' 'will' 'first'
 'because' 'make' 'her' 'people' 'made' 'bad' 'way' 'could' 'them' 'we'
 'any' 'after' 'then' 'too' 'watch' 'movies' 'seen' 'acting' 'think' 'she'
 'characters' 'many']
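
For reference, idf_ follows scikit-learn's smoothed formula idf = ln((1 + n) / (1 + df)) + 1 under the default smooth_idf=True, which is why ubiquitous words like 'the' rank lowest. A quick sanity check, assuming the vectorizer was refit on text_test by the grid search above:

# recompute the smoothed idf by hand and compare against the fitted values
X_fit=vectorizer.transform(text_test)            # corpus the vectorizer was fit on
df=np.asarray((X_fit>0).sum(axis=0)).ravel()     # document frequency of each term
manual_idf=np.log((1+X_fit.shape[0])/(1+df))+1
print(np.allclose(manual_idf,vectorizer.idf_))   # expected: True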
mglearn.tools.visualize_coefficients(
grid.best_estimator_.named_steps['logisticregression'].coef_,feature_names,n_top_features=40)

png

%%time
from sklearn.model_selection import GridSearchCV
# from tune_sklearn import TuneSearchCV
pipe=make_pipeline(TfidfVectorizer(min_df=5),LogisticRegression())
param_grid={'logisticregression__C':[0.001,0.01,0.1,1,10,100],'tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)]}
grid=GridSearchCV(pipe,param_grid=param_grid,n_jobs=-1)
grid.fit(text_test,y_test)
print('Best cross_validation score:{:.2f}'.format(grid.best_score_))
print('Best parameters:\n{}'.format(grid.best_params_))
Best cross_validation score:0.91
Best parameters:
{'logisticregression__C': 10, 'tfidfvectorizer__ngram_range': (1, 3)}
Wall time: 3min 38s
scores=grid.cv_results_['mean_test_score'].reshape(-1,3).T
heatmap=mglearn.tools.heatmap(scores,xlabel='C',ylabel='ngram_range',cmap='viridis',
                              fmt='%.3f',
                              xticklabels=param_grid['logisticregression__C'],
                              yticklabels=param_grid['tfidfvectorizer__ngram_range'])
plt.colorbar(heatmap)
<matplotlib.colorbar.Colorbar at 0x2ea84da4f40>

png

vect=grid.best_estimator_.named_steps['tfidfvectorizer']
feature_names=np.array(vect.get_feature_names())
coef=grid.best_estimator_.named_steps['logisticregression'].coef_
mglearn.tools.visualize_coefficients(coef,feature_names,n_top_features=40)

png

mask=np.array([len(feature.split(' '))for feature in feature_names])==3
mglearn.tools.visualize_coefficients(coef.ravel()[mask],feature_names[mask],n_top_features=40)

png

import spacy
import nltk
en_nlp=spacy.load('en_core_web_sm')
stemmer=nltk.stem.PorterStemmer()
def compare_normalization(doc):
    doc_spacy=en_nlp(doc)
    print('Lemmatization')
    print([token.lemma_ for token in doc_spacy])
    print('Stemming:')
    print([stemmer.stem(token.norm_.lower())for token in doc_spacy])
compare_normalization(u"Our meeting today was worse than yesterday,""I'm scared of meeting the clients tomorrow.")
Lemmatization
['our', 'meeting', 'today', 'be', 'bad', 'than', 'yesterday', ',', "i'm", 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
Stemming:
['our', 'meet', 'today', 'wa', 'wors', 'than', 'yesterday', ',', "i'm", 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
import re
regexp=re.compile('(?u)\\b\\w\\w+\\b')
en_nlp=spacy.load('en_core_web_sm')
old_tokenizer=en_nlp.tokenizer
en_nlp.tokenizer=lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))
def custom_tokenizer(document):
    doc_spacy=en_nlp(document,entity=False,parse=False)
    return [token.lemma_ for token in doc_spacy]
lemma_vect=CountVectorizer(tokenizer=custom_tokenizer,min_df=5)
X_test_lemma=lemma_vect.fit_transform(text_test)
print('X_test_lemma.shape{}'.format(X_test_lemma.shape))
'''.tokens_from_list raises an error; skip this for now'''
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-404-5d08729e20e8> in <module>
----> 1 X_test_lemma=lemma_vect.fit_transform(text_test)
      2 print('X_test_lemma.shape{}'.format(X_test_lemma.shape))
      3 '''.tokens_from_list raises an error; skip this for now'''
      4 


~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
   1200         max_features = self.max_features
   1201 
-> 1202         vocabulary, X = self._count_vocab(raw_documents,
   1203                                           self.fixed_vocabulary_)
   1204 


~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1112         for doc in raw_documents:
   1113             feature_counter = {}
-> 1114             for feature in analyze(doc):
   1115                 try:
   1116                     feature_idx = vocabulary[feature]


~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    104             doc = preprocessor(doc)
    105         if tokenizer is not None:
--> 106             doc = tokenizer(doc)
    107         if ngrams is not None:
    108             if stop_words is not None:


<ipython-input-403-79450082e040> in custom_tokenizer(document)
      5 en_nlp.tokenizer=lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))
      6 def custom_tokenizer(document):
----> 7     doc_spacy=en_nlp(document,entity=False,parse=False)
      8     return [token.lemma_ for token in doc_spacy]
      9 lemma_vect=CountVectorizer(tokenizer=custom_tokenizer,min_df=5)


TypeError: __call__() got an unexpected keyword argument 'entity'
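The failure is an API change rather than a logic error: spaCy v3 removed Tokenizer.tokens_from_list along with the entity/parse keyword arguments. A hedged sketch of the same lemmatizing tokenizer for spaCy v3 (assuming the en_core_web_sm pipeline loaded above; it bypasses en_nlp.tokenizer entirely by building the Doc from the regexp tokens):

from spacy.tokens import Doc

def custom_tokenizer(document):
    # pre-tokenize with the CountVectorizer-style regexp, then let the
    # pipeline components (tok2vec, tagger, lemmatizer, ...) annotate the Doc
    words=regexp.findall(document)
    doc=Doc(en_nlp.vocab,words=words)
    for name,proc in en_nlp.pipeline:
        doc=proc(doc)
    return [token.lemma_ for token in doc]

lemma_vect=CountVectorizer(tokenizer=custom_tokenizer,min_df=5)
X_test_lemma=lemma_vect.fit_transform(text_test)
print('X_test_lemma.shape:{}'.format(X_test_lemma.shape))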
vect=CountVectorizer(max_features=10000,max_df=.15)
X=vect.fit_transform(text_test)
from sklearn.decomposition import LatentDirichletAllocation
lda=LatentDirichletAllocation(n_components=10,learning_method='batch',max_iter=25,random_state=0)
document_topics=lda.fit_transform(X)
lda.components_.shape
(10, 10000)
sorting=np.argsort(lda.components_,axis=1)[:,::-1]
feature_names=np.array(vect.get_feature_names())
mglearn.tools.print_topics(topics=range(10),feature_names=feature_names,sorting=sorting,topics_per_chunk=5,
                          n_words=10)
topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
us            didn          role          war           action        
world         nothing       performance   american      role          
family        worst         girl          world         cast          
own           actually      cast          us            murder        
each          re            played        our           robert        
between       minutes       actress       history       plays         
real          going         young         german        performance   
human         actors        book          years         john          
young         guy           plays         country       actor         
without       pretty        women         soldiers      director      


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
car           horror        show          music         sex           
back          effects       funny         dvd           italian       
gets          monster       comedy        work          violence      
guy           fi            series        version       performance   
old           sci           tv            actors        dr            
down          space         episode       director      director      
big           special       now           original      quite         
town          evil          saw           game          gore          
kids          alien         years         years         woman         
around        series        watched       sound         shakespeare   
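
mglearn.tools.print_topics is only pretty-printing over the sorted components; the same top words can be read straight from the arrays, e.g. for topic 0:

# topic 0's ten highest-weight words; should match the 'topic 0' column above
print(feature_names[sorting[0,:10]])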
lda100=LatentDirichletAllocation(n_components=100,learning_method='batch',max_iter=25,random_state=0)
document_topics100=lda100.fit_transform(X)
topics=np.array([7,16,24,25,28,36,37,45,51,53,54,63,89,97])
sorting=np.argsort(lda100.components_,axis=1)[:,::-1]
feature_names=np.array(vect.get_feature_names())
mglearn.tools.print_topics(topics=topics,feature_names=feature_names,sorting=sorting,topics_per_chunk=7,n_words=20)
topic 7       topic 16      topic 24      topic 25      topic 28      topic 36      topic 37      
--------      --------      --------      --------      --------      --------      --------      
robin         noir          box           british       game          years         jackson       
doctor        robert        darkness      london        games         wonderful     michael       
de            spanish       heart         disaster      play          old           black         
murphy        murder        button        tv            jason         again         tale          
williams      young         rose          water         played        loved         blues         
hood          racism        mary          mr            lloyd         saw           videos        
jackie        detective     conrad        vegas         oscar         excellent     samuel        
tv            anti          gulliver      old           award         today         music         
eddie         crime         diaz          jordan        hoffman       now           justin        
part          ryan          kurtz         local         matthau       beautiful     ricci         
showtime      carmen        interesting   uk            billy         every         abuse         
niro          police        massacre      hamilton      academy       since         thriller      
role          picture       find          las           graphics      amazing       christina     
got           mitchum       marlow        day           won           ago           daughters     
russo         drama         party         minister      kramer        10            generation    
now           crossfire     travels       crocodile     ted           disney        south         
played        femme         joseph        tree          golf          perfect       us            
actor         spain         production    look          playing       recommend     vincent       
robert        against       however       cinema        greatest      favorite      nanny         
old           fine          based         set           marty         always        plays         


topic 45      topic 51      topic 53      topic 54      topic 63      topic 89      topic 97      
--------      --------      --------      --------      --------      --------      --------      
death         wax           reality       kids          alien         sex           christmas     
wife          torture       mark          children      vampire       women         books         
human         trap          kevin         school        space         men           queen         
own           macbeth       real          kid           aliens        female        am            
both          tourist       allen         parents       earth         woman         mini          
self          terrorist     jennifer      year          effects       male          novels        
hospital      rendition     caine         old           vampires      sexual        stanwyck      
yet           terrorists    virtual       child         fi            porn          barbara       
now           desert        woody         adults        sci           nudity        morgan        
us            director      michael       young         ship          sexy          lestat        
himself       cia           spacey        high          predator      naked         vice          
without       museum        mentally      age           planet        girl          damned        
becomes       streep        cronenberg    family        special       soft          magazine      
seems         andreas       sense         adult         hero          girls         snow          
however       el            away          show          star          core          read          
left          government    fiction       college       action        lesbian       fanny         
our           another       human         barney        human         looking       portray       
lives         meryl         makes         fun           evil          having        cook          
between       mannequins    challenged    girl          our           erotic        completely    
emotional     arab          dream         new           crew          beautiful     listening     
death=np.argsort(document_topics100[:,45])[::-1]
for i in death[:10]:
    print(b'.'.join(text_test[i].split(b'.')[:2])+b'.\n')
b'Francis Ford Coppola\'s "Apocalypse Now" is not a Vietnam War film. Do not confuse it with one.\n'
b"Dead Man Walking, absolutely brilliant, in tears by the end! You can not watch this film and not think about the issues it raises; how can you justify killing (whether it be murder or the death penalty) and to what point is forgiveness possible (not just in a spiritual way). Don't watch this film when your down! But WATCH IT!!!.\n"
b'The Hospital is a movie that was made ahead of its time. This film, produced by screenwriter Paddy Chayefsky, who gave us the Oscar-Winning film, "Network", deals with overworked staff, gross incompetence, and bureaucratic corruption at a large conglomerate hospital in Manhattan.\n'
b"What a self-indulgent mess! Duncan Roy's film is apparently autobiographical, however it's impossible to find any glimmer of emotional truth in this chaotic, badly acted and woefully amateurish fiasco.  In a way, you have to admire the balls of a man who through grim determination and a very generous benefactor manages to make a film about his own rise and fall - from abused, working class lad to criminal English lord.\n"
b"I've been intending to write a review of this film for some time, but only now have I actually managed to get my thoughts down for the perusal of others.  I never had the pleasure of seeing this film on the `big screen' which is a shame, as it is often visually stunning, but I have revisited it on video numerous times over the years, enjoying it immensely every time.\n"
b'Spoilers. This review has been edited due to word limit.\n'
b"Although the acting was excellent, I spent the whole movie waiting for the nasty boy who caused so much grief to so many of the characters, get his final nemesis, and instead everyone else suffered except him and he gets the job of the husband whose wife's death he causes by running away from his friends, wins the girl he gave an overdose to and tried to rape. Even his friend gives money to his father, but the butterfly effect completely fails to return to its cause.\n"
b'Chen Kaige lost his sense of tempo. I envy Europeans and Americans who can watch the film without following the dialog with their ears, because it is painful to do: slow, unnaturally heavy, and over-deliberate.\n'
b"Francis Ford Coppola's first 'personal' film, completed and released in 1969, was the last movie he made as a mostly unknown, up and coming director before The Godfather, and is in stark contrast to both that film, and the rest of his uneven career. It's ostensibly a road movie involving a disconnected young woman bored with domestic life, and pregnant with a child she isn't sure she wants, fleeing the trappings her dull marriage and hitting the open road in search of freedom.\n"
b"I don't remember too much about this movie except that there was a distinctly gratuitous destruction of luminaires (lamps). Almost every fight scene included the unnecessary and wanton destruction of useful light fixtures, even if outfitted with cheesy, '70's-style, cylindrical shades to keep with the time setting of the story.\n"
fig,ax=plt.subplots(1,2,figsize=(10,10))
topic_names=["{:>2}".format(i)+''.join(words)for i ,words in enumerate(feature_names[sorting[:,:2]])]
for col in [0,1]:
    start=col*50
    end=(col+1)*50
    ax[col].barh(np.arange(50),np.sum(document_topics100,axis=0)[start:end])
    ax[col].set_yticks(np.arange(50))
    ax[col].set_yticklabels(topic_names[start:end],ha='left',va='top')
    ax[col].invert_yaxis()
    ax[col].set_xlim(0,2000)
    yax=ax[col].get_yaxis()
    yax.set_tick_params(pad=130)
plt.tight_layout()

png
