博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

06SVM线性不可分时scikit-learn实现

Posted on 2019-04-07 10:04  心默默言  阅读(455)  评论(0编辑  收藏  举报

https://blog.csdn.net/weixin_40123108/article/details/84378202

 

In [5]:
from time import time
import logging  #程序进展信息
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split  #分割数据集
# from sklearn.cross_validation import train_test_split
from sklearn.datasets import fetch_lfw_people  #下载数据集
from sklearn.model_selection import GridSearchCV
# from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition import PCA
from sklearn.svm import SVC
In [7]:
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
In [8]:
print(lfw_people)
 
{'data': array([[254.33333 , 254.      , 252.      , ...,  87.      ,  88.666664,
         87.      ],
       [ 39.      ,  50.666668,  47.      , ..., 117.666664, 115.      ,
        133.      ],
       [ 89.666664, 103.666664, 126.333336, ..., 175.33333 , 183.33333 ,
        182.66667 ],
       ...,
       [ 86.666664,  80.      ,  74.333336, ...,  44.333332,  50.      ,
         44.666668],
       [ 50.666668,  65.333336,  88.333336, ..., 196.66667 , 178.66667 ,
        165.66667 ],
       [ 30.      ,  27.      ,  33.      , ...,  35.      ,  35.666668,
         61.      ]], dtype=float32), 'images': array([[[254.33333 , 254.      , 252.      , ...,  65.666664,
          51.333332,  40.333332],
        [253.33333 , 251.66667 , 247.66667 , ...,  66.666664,
          52.      ,  42.333332],
        [240.66667 , 231.33333 , 211.66667 , ...,  61.333332,
          49.      ,  41.666668],
        ...,
        [ 74.333336,  53.666668,  31.333334, ...,  97.333336,
          92.666664,  90.      ],
        [ 66.      ,  47.333332,  30.333334, ...,  91.666664,
          92.      ,  86.333336],
        [ 59.      ,  44.666668,  32.333332, ...,  87.      ,
          88.666664,  87.      ]],

       [[ 39.      ,  50.666668,  47.      , ...,  61.333332,
          51.      ,  38.666668],
        [ 47.666668,  63.      ,  65.      , ...,  58.333332,
          55.333332,  45.      ],
        [ 56.      ,  76.      ,  86.333336, ...,  71.      ,
          48.666668,  43.666668],
        ...,
        [ 73.666664,  75.333336,  75.333336, ..., 125.666664,
         119.666664, 116.      ],
        [ 75.333336,  75.666664,  76.666664, ..., 124.333336,
         116.      , 116.666664],
        [ 77.333336,  76.      ,  75.666664, ..., 117.666664,
         115.      , 133.      ]],

       [[ 89.666664, 103.666664, 126.333336, ..., 149.66667 ,
         150.66667 , 148.33333 ],
        [100.666664, 128.33333 , 143.66667 , ..., 159.      ,
         151.33333 , 147.33333 ],
        [124.      , 142.66667 , 146.33333 , ..., 161.33333 ,
         152.33333 , 147.33333 ],
        ...,
        [ 75.666664,  73.666664,  74.      , ..., 123.      ,
         168.33333 , 178.66667 ],
        [ 73.333336,  68.666664,  69.666664, ..., 150.      ,
         182.      , 181.66667 ],
        [ 79.      ,  66.      ,  66.333336, ..., 175.33333 ,
         183.33333 , 182.66667 ]],

       ...,

       [[ 86.666664,  80.      ,  74.333336, ...,  35.      ,
          35.      ,  39.666668],
        [ 85.      ,  75.      ,  79.666664, ...,  37.      ,
          35.      ,  37.      ],
        [ 71.666664,  65.666664,  94.666664, ...,  41.      ,
          37.      ,  36.333332],
        ...,
        [ 92.333336,  88.333336,  87.333336, ...,  66.      ,
          79.333336,  94.      ],
        [ 86.666664,  86.      ,  88.666664, ...,  46.333332,
          58.666668,  64.333336],
        [ 77.      ,  78.666664,  81.666664, ...,  44.333332,
          50.      ,  44.666668]],

       [[ 50.666668,  65.333336,  88.333336, ..., 159.      ,
         158.66667 , 152.      ],
        [ 60.      ,  83.      ,  99.333336, ..., 157.33333 ,
         150.66667 , 149.66667 ],
        [ 62.      ,  90.666664,  94.333336, ..., 157.33333 ,
         145.      , 143.66667 ],
        ...,
        [ 60.333332,  61.      ,  62.      , ..., 151.33333 ,
         167.      , 164.33333 ],
        [ 61.      ,  61.666668,  62.666668, ..., 187.33333 ,
         176.33333 , 167.      ],
        [ 62.      ,  61.      ,  62.666668, ..., 196.66667 ,
         178.66667 , 165.66667 ]],

       [[ 30.      ,  27.      ,  33.      , ...,  90.      ,
          53.333332,  46.      ],
        [ 31.333334,  31.666666,  37.333332, ..., 104.333336,
          56.333332,  42.666668],
        [ 33.666668,  33.666668,  39.      , ..., 123.333336,
          71.333336,  52.333332],
        ...,
        [ 45.666668,  44.      ,  43.666668, ...,  23.      ,
          20.333334,  34.333332],
        [ 42.333332,  42.      ,  45.      , ...,  23.666666,
          27.      ,  44.      ],
        [ 45.333332,  49.666668,  51.333332, ...,  35.      ,
          35.666668,  61.      ]]], dtype=float32), 'target': array([5, 6, 3, ..., 5, 3, 5], dtype=int64), 'target_names': array(['Ariel Sharon', 'Colin Powell', 'Donald Rumsfeld', 'George W Bush',
       'Gerhard Schroeder', 'Hugo Chavez', 'Tony Blair'], dtype='<U17'), 'DESCR': ".. _labeled_faces_in_the_wild_dataset:\n\nThe Labeled Faces in the Wild face recognition dataset\n------------------------------------------------------\n\nThis dataset is a collection of JPEG pictures of famous people collected\nover the internet, all details are available on the official website:\n\n    http://vis-www.cs.umass.edu/lfw/\n\nEach picture is centered on a single face. The typical task is called\nFace Verification: given a pair of two pictures, a binary classifier\nmust predict whether the two images are from the same person.\n\nAn alternative task, Face Recognition or Face Identification is:\ngiven the picture of the face of an unknown person, identify the name\nof the person by referring to a gallery of previously seen pictures of\nidentified persons.\n\nBoth Face Verification and Face Recognition are tasks that are typically\nperformed on the output of a model trained to perform Face Detection. The\nmost popular model for Face Detection is called Viola-Jones and is\nimplemented in the OpenCV library. The LFW faces were extracted by this\nface detector from various online websites.\n\n**Data Set Characteristics:**\n\n    =================   =======================\n    Classes                                5749\n    Samples total                         13233\n    Dimensionality                         5828\n    Features            real, between 0 and 255\n    =================   =======================\n\nUsage\n~~~~~\n\n``scikit-learn`` provides two loaders that will automatically download,\ncache, parse the metadata files, decode the jpeg and convert the\ninteresting slices into memmapped numpy arrays. This dataset size is more\nthan 200 MB. The first load typically takes more than a couple of minutes\nto fully decode the relevant part of the JPEG files into numpy arrays. If\nthe dataset has  been loaded once, the following times the loading times\nless than 200ms by using a memmapped version memoized on the disk in the\n``~/scikit_learn_data/lfw_home/`` folder using ``joblib``.\n\nThe first loader is used for the Face Identification task: a multi-class\nclassification task (hence supervised learning)::\n\n  >>> from sklearn.datasets import fetch_lfw_people\n  >>> lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)\n\n  >>> for name in lfw_people.target_names:\n  ...     print(name)\n  ...\n  Ariel Sharon\n  Colin Powell\n  Donald Rumsfeld\n  George W Bush\n  Gerhard Schroeder\n  Hugo Chavez\n  Tony Blair\n\nThe default slice is a rectangular shape around the face, removing\nmost of the background::\n\n  >>> lfw_people.data.dtype\n  dtype('float32')\n\n  >>> lfw_people.data.shape\n  (1288, 1850)\n\n  >>> lfw_people.images.shape\n  (1288, 50, 37)\n\nEach of the ``1140`` faces is assigned to a single person id in the ``target``\narray::\n\n  >>> lfw_people.target.shape\n  (1288,)\n\n  >>> list(lfw_people.target[:10])\n  [5, 6, 3, 1, 0, 1, 3, 4, 3, 0]\n\nThe second loader is typically used for the face verification task: each sample\nis a pair of two picture belonging or not to the same person::\n\n  >>> from sklearn.datasets import fetch_lfw_pairs\n  >>> lfw_pairs_train = fetch_lfw_pairs(subset='train')\n\n  >>> list(lfw_pairs_train.target_names)\n  ['Different persons', 'Same person']\n\n  >>> lfw_pairs_train.pairs.shape\n  (2200, 2, 62, 47)\n\n  >>> lfw_pairs_train.data.shape\n  (2200, 5828)\n\n  >>> lfw_pairs_train.target.shape\n  (2200,)\n\nBoth for the :func:`sklearn.datasets.fetch_lfw_people` and\n:func:`sklearn.datasets.fetch_lfw_pairs` function it is\npossible to get an additional dimension with the RGB color channels by\npassing ``color=True``, in that case the shape will be\n``(2200, 2, 62, 47, 3)``.\n\nThe :func:`sklearn.datasets.fetch_lfw_pairs` datasets is subdivided into\n3 subsets: the development ``train`` set, the development ``test`` set and\nan evaluation ``10_folds`` set meant to compute performance metrics using a\n10-folds cross validation scheme.\n\n.. topic:: References:\n\n * `Labeled Faces in the Wild: A Database for Studying Face Recognition\n   in Unconstrained Environments.\n   <http://vis-www.cs.umass.edu/lfw/lfw.pdf>`_\n   Gary B. Huang, Manu Ramesh, Tamara Berg, and Erik Learned-Miller.\n   University of Massachusetts, Amherst, Technical Report 07-49, October, 2007.\n\n\nExamples\n~~~~~~~~\n\n:ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py`\n"}
In [9]:
# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
n_samples, h, w = lfw_people.images.shape  # 图像矩阵的行h,列w
print(n_samples, h, w)  # 1288 50 37
 
1288 50 37
In [12]:
X = lfw_people.data  #图片数据
X.shape
Out[12]:
(1288, 1850)
In [13]:
n_features = X.shape[1]  # 矩阵列数特征点数据1850
n_features
Out[13]:
1850
In [14]:
# the label to predict is the id of the person
y = lfw_people.target  # y是label,目标代号0,1,2,3,......
y
Out[14]:
array([5, 6, 3, ..., 5, 3, 5], dtype=int64)
In [16]:
target_names = lfw_people.target_names  # 实际有哪些名字,这个是一个字符串
target_names
Out[16]:
array(['Ariel Sharon', 'Colin Powell', 'Donald Rumsfeld', 'George W Bush',
       'Gerhard Schroeder', 'Hugo Chavez', 'Tony Blair'], dtype='<U17')
In [18]:
n_classes = target_names.shape[0]  #shape[0]--行维数 shape[1]--列维数
n_classes
Out[18]:
7
In [19]:
print("Total dataset size:")
print("n_samples: %d\nn_features: %d\nn_classes: %d" % (n_samples, n_features, n_classes))
 
Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7
In [20]:
# Split into a training set and a test set using a stratified k fold
# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
 
(966, 1850) (966,) (322, 1850) (322,)
In [21]:
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
 
Extracting the top 150 eigenfaces from 966 faces
In [22]:
t0 = time()
# pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
pca = PCA(svd_solver='randomized', n_components=n_components, whiten=True)  #降低维度,提取特征,(此处特征值位度较高)
pca.fit(X, y)  # 训练如何降维
print("done in %0.3fs" % (time() - t0))
 
done in 3.806s
In [23]:
eigenfaces = pca.components_.reshape((n_components, h, w))  # 三维
# eigenfaces = pca.components_.reshape((n_components, h, w))
In [24]:
print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))
 
Projecting the input data on the eigenfaces orthonormal basis
done in 0.180s
In [35]:
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 998, 1001, 999, 1002],
              'gamma': [0.0001, 0.003, 0.0035, 0.004, 0.0045], }  # 不停缩小范围
# clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = GridSearchCV(SVC(kernel='rbf', class_weight=None), param_grid)  # GridSearchCV()第一个参数是分类器
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)
 
Fitting the classifier to the training set
 
C:\Users\Administrator\Envs\MachineLearning\lib\site-packages\sklearn\model_selection\_split.py:2053: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)
C:\Users\Administrator\Envs\MachineLearning\lib\site-packages\sklearn\model_selection\_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
 
done in 24.177s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.003, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [33]:
print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

# Qualitative evaluation of the predictions using matplotlib


def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())

# plot the result of the prediction on a portion of the test set
 
 
def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)


prediction_titles = [title(y_pred, y_test, target_names, i)
                     for i in range(y_pred.shape[0])]

plot_gallery(X_test, prediction_titles, h, w)

# plot the gallery of the most significative eigenfaces

eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)

plt.show()
 
Predicting people's names on the test set
done in 0.141s
                   precision    recall  f1-score   support

     Ariel Sharon       0.88      0.79      0.83        19
     Colin Powell       0.79      0.86      0.82        64
  Donald Rumsfeld       0.93      0.62      0.74        42
    George W Bush       0.82      0.93      0.87       123
Gerhard Schroeder       0.87      0.71      0.78        28
      Hugo Chavez       0.71      0.77      0.74        13
       Tony Blair       0.90      0.82      0.86        33

        micro avg       0.83      0.83      0.83       322
        macro avg       0.84      0.79      0.81       322
     weighted avg       0.84      0.83      0.83       322

[[ 15   0   0   3   0   1   0]
 [  0  55   0   9   0   0   0]
 [  1   4  26   6   2   1   2]
 [  0   6   1 115   0   1   0]
 [  0   2   0   4  20   1   1]
 [  1   1   0   1   0  10   0]
 [  0   2   1   2   1   0  27]]
 
 
In [ ]: