from sklearn.decomposition import PCA
from sklearn.datasets import load_iris

pca = PCA(n_components=2, whiten=True)  # keep two principal components; whiten=True turns on whitening of the projected data (not regularization)
iris = load_iris()
pca.fit(iris.data)
pca.components_ # display the fitted principal components (one row per component, one column per original feature)

array([[ 0.36138659, -0.08452251,  0.85667061,  0.3582892 ],
       [ 0.65658877,  0.73016143, -0.17337266, -0.07548102]])

pca.explained_variance_ratio_  # fraction of the data's variance explained by each of the two components

array([0.92461872, 0.05306648])

pca.explained_variance_ratio_.sum()  # total fraction of variance captured by the two components together

0.9776852063187949

# Project the iris samples onto the two fitted components, then compare the
# shape of the original data with the shape of the reduced data.
x_pca = pca.transform(iris.data)
iris.data.shape, x_pca.shape

((150, 4), (150, 2))

原始数据是150个样本，4个特征；降维之后的数据是150个样本，2个特征。

import matplotlib.pyplot as plt
from itertools import cycle


def plot_PCA_2D(data, target, target_names):
    """Scatter-plot the first two columns of ``data``, one colour per class.

    Args:
        data: 2-D array indexed as ``data[mask, column]``; columns 0 and 1
            are used as the x and y coordinates (expects NumPy-style
            boolean-mask indexing).
        target: Array of integer class ids aligned with the rows of
            ``data``; rows where ``target == i`` are drawn under
            ``target_names[i]``.
        target_names: Class labels; one scatter series is drawn per entry.

    A new figure is created on every call and left as the current figure,
    so the caller can still add a title or other decorations afterwards.
    """
    colors = cycle('rgbcmykw')
    plt.figure()
    for class_id, (color, label) in enumerate(zip(colors, target_names)):
        mask = target == class_id
        plt.scatter(data[mask, 0], data[mask, 1], c=color, label=label)
    # The labels passed to scatter() are only displayed once a legend is drawn.
    plt.legend()

# Plot the 2-D PCA projection coloured by the true species, so it can be
# compared with the K-means clustering of the same projection further down.
plot_PCA_2D(x_pca, iris.target, iris.target_names)

from sklearn.cluster import KMeans

# Cluster the 2-D PCA projection into three groups and show the cluster id
# assigned to each sample.
# NOTE(review): no random_state is passed, so the numbering of the clusters
# (which group is 0/1/2) can presumably differ between runs — confirm.
k_means = KMeans(n_clusters=3)
k_means.fit(x_pca)
k_means.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2,
       2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2,
       1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1])

# Plot the PCA projection again, this time coloured by the K-means cluster id
# instead of the true species, and title the figure accordingly.
plot_PCA_2D(x_pca, k_means.labels_, ['c0', 'c1', 'c2'])
plt.title('kmeans_labels')

Text(0.5, 1.0, 'kmeans_labels')

心默默言

公告

03scikit-learn非监督学习