In [1]:
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
# Load the iris measurements first, then fit a two-component PCA on them.
iris = load_iris()
# whiten=True requests whitened output from the projection (this is
# whitening, not regularization); fit() returns the fitted estimator itself.
pca = PCA(n_components=2, whiten=True).fit(iris.data)
# Show the fitted principal axes (one row per component).
pca.components_
Out[1]:
In [2]:
pca.explained_variance_ratio_ # Fraction of the data's variance explained by each of the two components
Out[2]:
In [4]:
pca.explained_variance_ratio_.sum() # Combined fraction of the variance captured by the two components together
Out[4]:
In [5]:
# Project the iris samples onto the principal components fitted above.
x_pca = pca.transform(iris.data)
# Display the array shapes before and after the projection, side by side.
iris.data.shape, x_pca.shape
Out[5]:
原始数据是150个样本,4个特征;降维之后的数据是150个样本,2个特征。
In [8]:
import matplotlib.pyplot as plt
from itertools import cycle
def plot_PCA_2D(data, target, target_names):
    """Scatter-plot the first two columns of ``data``, one colour per class.

    Parameters
    ----------
    data : 2-D array indexed as ``data[mask, column]``
        Only columns 0 and 1 are drawn.
    target : array of integer class ids
        Compared elementwise against ``0 .. len(target_names) - 1`` to
        select the rows belonging to each class.
    target_names : sequence of str
        Legend labels; position ``i`` labels the rows whose target equals ``i``.

    Returns
    -------
    None
        The points are drawn on a newly created pyplot figure, which stays
        the current figure so callers can keep decorating it (e.g. with
        ``plt.title``).
    """
    # Colours are reused cyclically when there are more classes than entries;
    # note the last entry ('w') is white.
    colors = cycle('rgbcmykw')
    plt.figure()
    for class_id, (color, label) in enumerate(zip(colors, target_names)):
        in_class = target == class_id
        plt.scatter(data[in_class, 0], data[in_class, 1],
                    c=color, label=label)
    # The labels handed to scatter() are only visible once a legend is drawn;
    # skip it when nothing was plotted to avoid an empty-legend warning.
    if len(target_names) > 0:
        plt.legend()
In [9]:
# Plot the PCA-projected samples coloured by their true species. Passing the
# raw iris.data here would draw only its first two original features, which
# is not comparable with the k-means plot drawn in the 2-D PCA space.
plot_PCA_2D(x_pca, iris.target, iris.target_names)
In [10]:
from sklearn.cluster import KMeans
# Cluster the PCA-projected samples into three groups.
# random_state pins the centroid initialisation so the cluster ids (and hence
# the colours in the plot of k_means.labels_) are the same on every run;
# n_init is set explicitly because its default differs between scikit-learn
# versions.
k_means = KMeans(n_clusters=3, n_init=10, random_state=0)
k_means.fit(x_pca)
# Cluster index assigned to each sample.
k_means.labels_
Out[10]:
In [11]:
# Draw the projected samples again, this time coloured by the cluster index
# that k-means assigned to each of them.
plot_PCA_2D(
    data=x_pca,
    target=k_means.labels_,
    target_names=['c0', 'c1', 'c2'],
)
# Title the figure that plot_PCA_2D just created.
plt.title('kmeans_labels')
Out[11]: