PCA降维
读取数据
import pandas as pd
import openpyxl
import numpy as np

# Read the whole sheet as raw cells (no header row is inferred).
# The path is a raw string: in a normal string literal the "\j" and "\d"
# sequences are invalid escapes, which newer Python versions warn about.
# The resulting path value is unchanged.
data = pd.read_excel(
    r"E:\jupyter_root_directory\data/我国大陆经济发展状况数据.xlsx",
    header=None,
    engine="openpyxl",
)
# Skip the first two rows of the sheet (presumably title/header rows --
# confirm against the workbook).
data = data[2:]
# Keep only columns 1..8; column 0 is dropped (presumably a label column --
# confirm against the workbook).
data = data[[1, 2, 3, 4, 5, 6, 7, 8]]
data
| | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
---|---|---|---|---|---|---|---|---|
2 | 1394.89 | 2505 | 519.01 | 8144 | 373.9 | 117.3 | 112.6 | 843.43 |
3 | 920.11 | 2720 | 345.46 | 6501 | 342.8 | 115.2 | 110.6 | 582.51 |
4 | 2849.52 | 1258 | 704.87 | 4839 | 2033.3 | 115.2 | 115.8 | 1234.85 |
5 | 1092.48 | 1250 | 290.9 | 4721 | 717.3 | 116.9 | 115.6 | 697.25 |
6 | 832.88 | 1387 | 250.23 | 4134 | 781.7 | 117.5 | 116.8 | 419.39 |
7 | 2793.37 | 2397 | 387.99 | 4911 | 1371.1 | 116.1 | 114 | 1840.55 |
8 | 1129.2 | 1872 | 320.45 | 4430 | 497.4 | 115.2 | 114.2 | 762.47 |
9 | 2014.53 | 2334 | 435.73 | 4145 | 824.8 | 116.1 | 114.3 | 1240.37 |
10 | 2462.57 | 5343 | 996.48 | 9279 | 207.4 | 118.7 | 113 | 1642.95 |
11 | 5155.25 | 1926 | 1434.95 | 5934 | 1025.5 | 115.8 | 114.3 | 2026.64 |
12 | 3524.79 | 2249 | 1006.39 | 6619 | 754.4 | 116.6 | 113.5 | 916.59 |
13 | 2003.58 | 1254 | 474 | 4609 | 908.3 | 114.8 | 112.7 | 824.14 |
14 | 2160.52 | 2320 | 553.97 | 5857 | 609.3 | 115.2 | 114.4 | 433.67 |
15 | 1205.1 | 1182 | 282.84 | 4211 | 411.7 | 116.9 | 115.9 | 571.84 |
16 | 5002.34 | 1527 | 1229.55 | 5145 | 1196.6 | 117.6 | 114.2 | 2207.69 |
17 | 3002.74 | 1034 | 670.35 | 4344 | 1574.4 | 116.5 | 114.9 | 1367.92 |
18 | 2391.42 | 1527 | 571.68 | 4685 | 849 | 120 | 116.6 | 1220.72 |
19 | 2195.7 | 1408 | 422.61 | 4797 | 1011.8 | 119 | 115.5 | 843.83 |
20 | 5381.72 | 2699 | 1639.83 | 8250 | 656.5 | 114 | 111.6 | 1396.35 |
21 | 1606.15 | 1314 | 382.59 | 5150 | 556 | 118.4 | 116.4 | 554.97 |
22 | 364.17 | 1814 | 198.35 | 5340 | 232.1 | 113.5 | 111.3 | 64.33 |
23 | 3534 | 1261 | 822.54 | 4645 | 902.3 | 118.5 | 117 | 1431.81 |
24 | 630.07 | 942 | 150.84 | 4475 | 301.1 | 121.4 | 117.2 | 324.72 |
25 | 1206.68 | 1261 | 334 | 5149 | 310.4 | 121.3 | 118.1 | 716.65 |
26 | 55.98 | 1110 | 17.87 | 7382 | 4.2 | 117.3 | 114.9 | 5.57 |
27 | 1000.03 | 1208 | 300.27 | 4396 | 500.9 | 119 | 117 | 600.98 |
28 | 553.35 | 1007 | 114.81 | 5493 | 507 | 119.8 | 116.5 | 468.79 |
29 | 165.31 | 1445 | 47.76 | 5753 | 61.6 | 118 | 116.3 | 105.8 |
30 | 169.75 | 1355 | 61.98 | 5079 | 121.8 | 117.1 | 115.3 | 114.4 |
31 | 834.57 | 1469 | 376.95 | 5348 | 339 | 119.7 | 116.7 | 428.76 |
去中心化:每个数据减去对应每列的平均值
# Row and column counts of the table.
sample, feature = data.shape
# Centre every column on zero by subtracting that column's own mean.
# The column-wise mean is requested explicitly: np.mean(DataFrame) forwards
# to DataFrame.mean(axis=None), which in pandas >= 2.0 reduces over both axes
# and returns a single scalar instead of one mean per column.
data = data - data.mean(axis=0)
data
| | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
---|---|---|---|---|---|---|---|---|
2 | -526.202 | 759.067 | 7.50167 | 2685.17 | -292.22 | 0.0133333 | -2.30667 | -19.568 |
3 | -1000.98 | 974.067 | -166.048 | 1042.17 | -323.32 | -2.08667 | -4.30667 | -280.488 |
4 | 928.428 | -487.933 | 193.362 | -619.833 | 1367.18 | -2.08667 | 0.893333 | 371.852 |
5 | -828.612 | -495.933 | -220.608 | -737.833 | 51.18 | -0.386667 | 0.693333 | -165.748 |
6 | -1088.21 | -358.933 | -261.278 | -1324.83 | 115.58 | 0.213333 | 1.89333 | -443.608 |
7 | 872.278 | 651.067 | -123.518 | -547.833 | 704.98 | -1.18667 | -0.906667 | 977.552 |
8 | -791.892 | 126.067 | -191.058 | -1028.83 | -168.72 | -2.08667 | -0.706667 | -100.528 |
9 | 93.4377 | 588.067 | -75.7783 | -1313.83 | 158.68 | -1.18667 | -0.606667 | 377.372 |
10 | 541.478 | 3597.07 | 484.972 | 3820.17 | -458.72 | 1.41333 | -1.90667 | 779.952 |
11 | 3234.16 | 180.067 | 923.442 | 475.167 | 359.38 | -1.48667 | -0.606667 | 1163.64 |
12 | 1603.7 | 503.067 | 494.882 | 1160.17 | 88.28 | -0.686667 | -1.40667 | 53.592 |
13 | 82.4877 | -491.933 | -37.5083 | -849.833 | 242.18 | -2.48667 | -2.20667 | -38.858 |
14 | 239.428 | 574.067 | 42.4617 | 398.167 | -56.82 | -2.08667 | -0.506667 | -429.328 |
15 | -715.992 | -563.933 | -228.668 | -1247.83 | -254.42 | -0.386667 | 0.993333 | -291.158 |
16 | 3081.25 | -218.933 | 718.042 | -313.833 | 530.48 | 0.313333 | -0.706667 | 1344.69 |
17 | 1081.65 | -711.933 | 158.842 | -1114.83 | 908.28 | -0.786667 | -0.00666667 | 504.922 |
18 | 470.328 | -218.933 | 60.1717 | -773.833 | 182.88 | 2.71333 | 1.69333 | 357.722 |
19 | 274.608 | -337.933 | -88.8983 | -661.833 | 345.68 | 1.71333 | 0.593333 | -19.168 |
20 | 3460.63 | 953.067 | 1128.32 | 2791.17 | -9.62 | -3.28667 | -3.30667 | 533.352 |
21 | -314.942 | -431.933 | -128.918 | -308.833 | -110.12 | 1.11333 | 1.49333 | -308.028 |
22 | -1556.92 | 68.0667 | -313.158 | -118.833 | -434.02 | -3.78667 | -3.60667 | -798.668 |
23 | 1612.91 | -484.933 | 311.032 | -813.833 | 236.18 | 1.21333 | 2.09333 | 568.812 |
24 | -1291.02 | -803.933 | -360.668 | -983.833 | -365.02 | 4.11333 | 2.29333 | -538.278 |
25 | -714.412 | -484.933 | -177.508 | -309.833 | -355.72 | 4.01333 | 3.19333 | -146.348 |
26 | -1865.11 | -635.933 | -493.638 | 1923.17 | -661.92 | 0.0133333 | -0.00666667 | -857.428 |
27 | -921.062 | -537.933 | -211.238 | -1062.83 | -165.22 | 1.71333 | 2.09333 | -262.018 |
28 | -1367.74 | -738.933 | -396.698 | 34.1667 | -159.12 | 2.51333 | 1.59333 | -394.208 |
29 | -1755.78 | -300.933 | -463.748 | 294.167 | -604.52 | 0.713333 | 1.39333 | -757.198 |
30 | -1751.34 | -390.933 | -449.528 | -379.833 | -544.32 | -0.186667 | 0.393333 | -748.598 |
31 | -1086.52 | -276.933 | -134.558 | -110.833 | -327.12 | 2.41333 | 1.79333 | -434.238 |
计算协方差矩阵:直接调用方法即可
# Convert the centred table to a float64 matrix.
# float64 is used instead of float16: half precision keeps only ~3 significant
# decimal digits, so values in the thousands would be rounded to the nearest
# 1-2 units before the covariance is computed.
# np.asmatrix replaces np.mat, an alias that was removed in NumPy 2.0; data1
# is still an np.matrix, so later matrix products behave as before.
data1 = np.asmatrix(np.asarray(data, dtype=np.float64))

# Compute the covariance matrix.
# np.cov expects one variable per row, hence the transpose.
covX = np.cov(data1.T)
covX
array([[ 2.17512816e+06, 3.39017180e+05, 5.64795310e+05, 3.66799624e+05, 4.18740435e+05, -8.14159678e+02, -7.37804742e+02, 7.53426315e+05], [ 3.39017180e+05, 7.42673545e+05, 1.47954656e+05, 8.10174225e+05, -5.98597476e+04, -4.10269647e+02, -9.69473039e+02, 1.82942939e+05], [ 5.64795310e+05, 1.47954656e+05, 1.62302951e+05, 2.10470018e+05, 7.98055667e+04, -2.28857444e+02, -2.74499481e+02, 1.86527121e+05], [ 3.66799624e+05, 8.10174225e+05, 2.10470018e+05, 1.71571948e+06, -2.14593340e+05, -3.56041630e+02, -1.33849453e+03, 7.91434689e+04], [ 4.18740435e+05, -5.98597476e+04, 7.98055667e+04, -2.14593340e+05, 2.11547288e+05, -2.35784325e+02, 1.89958719e+01, 1.77085901e+05], [-8.14159678e+02, -4.10269647e+02, -2.28857444e+02, -3.56041630e+02, -2.35784325e+02, 4.10102506e+00, 2.93249666e+00, -1.48336671e+02], [-7.37804742e+02, -9.69473039e+02, -2.74499481e+02, -1.33849453e+03, 1.89958719e+01, 2.93249666e+00, 3.60350766e+00, -2.13093770e+02], [ 7.53426315e+05, 1.82942939e+05, 1.86527121e+05, 7.91434689e+04, 1.77085901e+05, -1.48336671e+02, -2.13093770e+02, 3.41794042e+05]])
求特征值和特征向量:调用专有的方法
# covX is a real symmetric matrix, so the symmetric solver is used: it returns
# real eigenvalues and orthonormal eigenvectors, whereas the general solver
# gives no such guarantee. Column i of eig_vec belongs to eig_val[i].
# NOTE: eigh returns the eigenvalues in ascending order.
eig_val, eig_vec = np.linalg.eigh(covX)
# Pair each eigenvalue magnitude with its eigenvector (rows of eig_vec.T are
# the columns of eig_vec).
eig_pairs = [(np.abs(value), vector) for value, vector in zip(eig_val, eig_vec.T)]
eig_val
array([3.00989343e+06, 1.90990488e+06, 3.00900997e+05, 8.22843300e+04, 4.08689937e+04, 5.31590865e+03, 4.22552725e+00, 4.17682352e-01])
对特征值从大到小排序
# Positions of the eigenvalues ordered from largest to smallest
# (negating the values turns argsort's ascending order into descending).
index = np.argsort(-eig_val)
# Show the descending order that is actually stored and used afterwards.
index
降维
# Number of principal components to keep.
k = 3
# Rows of eig_vec.T are the eigenvectors; select the ones belonging to the
# k largest eigenvalues (index is ordered from largest to smallest).
selectVec = np.matrix(eig_vec.T[index[:k]])
# Project the centred data onto the selected components:
# (30, 8) @ (8, 3) = (30, 3)
# "@" is an explicit matrix product and does not depend on np.matrix
# overloading "*".
finalData = data1 @ selectVec.T
finalData
matrix([[ 991.02791011, -2598.05143659, -442.07545458], [ -145.34532529, -1734.65887357, 457.16158882], [ 557.61661571, 1483.60034716, -93.14172337], [-1197.70506441, 311.50193131, 20.75043138], [-1691.41478306, 542.6983315 , 364.01896739], [ 925.58426428, 922.66694957, 961.42758107], [-1105.78714784, 282.27855119, 652.96927436], [ -230.8764751 , 915.2909667 , 1154.93539315], [ 3479.73362389, -3833.86882421, 1432.59322724], [ 3279.28517562, 1443.82207397, -340.60839295], [ 2020.42780485, -236.33723265, -368.15435258], [ -459.95372537, 899.6555233 , -51.4321421 ], [ 421.88446529, -480.08094189, 143.32609056], [-1424.33703514, 671.74965252, 96.75028233], [ 2712.10215068, 2166.91458621, -234.05003709], [ 370.55219993, 1915.41058406, -103.89839129], [ 75.40542303, 988.63141429, 171.25143746], [ -179.38146112, 819.30680274, -18.32663648], [ 4553.43459221, -648.74796439, -936.50316474], [ -631.0710909 , 148.57756477, -260.22529535], [-1543.48297854, -931.20547131, 148.74331019], [ 963.19113519, 1714.01295905, -193.78601902], [-1924.84798562, 212.88195738, -181.92664169], [ -942.34945313, -52.02992477, -208.50404981], [-1165.55041791, -2423.78738221, -1276.80765054], [-1469.95701957, 456.57292109, 92.47839383], [-1461.05431309, -532.72325834, -483.85639992], [-1662.97477831, -1243.25114115, -294.80964414], [-1974.30737332, -688.33978176, -78.22032709], [-1139.6165708 , -491.32802526, -129.98141111]])