tsne、pca、自编码器绘图(CC2)——一定记得使用 StandardScaler 做无量纲化处理,数据聚类会更明显

tsne

数据不做预处理:

1
2
3
4
5
6
7
8
# coding: utf-8
# stdlib
import collections
import os
import pickle

# third-party (numpy was imported twice in the original; deduplicated)
import numpy as np
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors

 

1
2
3
4
5
6
7
8
9
10
11
12
13
# ....... (X, y and the *_verify sample lists are built in earlier, unshown code)

# Append the held-out verification samples so they get embedded in the
# same t-SNE space as the training data.
X = X + black_verify + white_verify + unknown_verify + bd_verify
# Fixed: this was a Python-2 `print` statement; the rest of the script
# uses the print() function.
print(black_verify_labels + white_verify_labels + unknown_verify_labels + bd_verify_labels)
y = y + black_verify_labels + white_verify_labels + unknown_verify_labels + bd_verify_labels
print("ALL data check:")
print("len of X:", len(X))
print("len of y:", len(y))
# print(unknown_verify)

# Project the combined data down to 2-D for plotting.
X_embedded = TSNE(n_components=2).fit_transform(X)

# Persist the embedding and labels so plotting can run as a separate step.
with open("tsne_data_X.pkl", "wb") as f:
    pickle.dump([X_embedded, y], f)

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import pickle
from collections import Counter

import numpy as np
import matplotlib.pyplot as Plot


def main():
    """Load the saved t-SNE embedding and draw a labelled 2-D scatter plot."""
    with open("tsne_data_X.pkl", "rb") as f:
        # iso-8859-1 lets a pickle written by Python 2 load under Python 3.
        X_embedded, y = pickle.load(f, encoding='iso-8859-1')

    print(len(X_embedded))
    print(len(y))
    print(X_embedded[:3])
    print(y[:3])
    # Sanity check: every label must be a scalar class id, not a nested list.
    for i, label in enumerate(y):
        if isinstance(label, list):
            raise Exception(str([i, y]))
    print(Counter(y))

    Y, labels = np.array(X_embedded), np.array(y)
    titles = ("white", "black", "black_verify_labels", "white_verify_labels",
              "unknown_verify_labels", "bd_verify_labels")
    colors = ['b', 'c', 'y', 'm', 'r', 'g', 'peru']
    # One scatter layer per class id (0..5) so the legend gets one entry each.
    for cls in range(6):
        idx = [j for j in range(len(labels)) if labels[j] == cls]
        Plot.scatter(Y[idx, 0], Y[idx, 1], 20, color=colors[cls], label=titles[cls])
    Plot.legend()
    Plot.savefig('tsne.pdf')
    Plot.show()


if __name__ == "__main__":
    main()

 

 

数据做 StandardScaler 标准化处理后:

使用pca,不进行预处理:

使用standard scaler预处理,再做pca:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Standardize the features (zero mean, unit variance) before PCA —
# without this scaling the clusters separate far less clearly.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X)
#scaler = preprocessing.MinMaxScaler().fit(X)
X = scaler.transform(X)
print("standard X sample:", X[:3])

# Apply the SAME scaler (fitted on X only) to every verification set so
# all samples live in one common feature scale.
black_verify = scaler.transform(black_verify)
print(black_verify)

white_verify = scaler.transform(white_verify)
print(white_verify)

unknown_verify = scaler.transform(unknown_verify)
print(unknown_verify)

bd_verify = scaler.transform(bd_verify)
print(bd_verify)

#print black_verify_labels+white_verify_labels+unknown_verify_labels+bd_verify_labels

# Stack the verification samples onto the training matrix; X is now a
# numpy array (after scaler.transform), so np.concatenate is required here
# while the label lists are still concatenated with +.
X = np.concatenate((X,black_verify,white_verify,unknown_verify,bd_verify))
#X = X+black_verify+white_verify+unknown_verify+bd_verify
y = y+black_verify_labels+white_verify_labels+unknown_verify_labels+bd_verify_labels
print("ALL data check:")
print("len of X:", len(X))
print("len of y:", len(y))
# print(unknown_verify)

# Project to 2-D with PCA and save the result for the separate plot step.
X_embedded = PCA(n_components=2).fit_transform(X)

with open("pca_data_X_scaled.pkl", "wb") as f:
    pickle.dump([X_embedded, y], f)

 

最后效果:

最后使用自编码器来降维:

代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Combine the training matrix with every verification set; labels stay a
# plain Python list and are concatenated with +.
X = np.concatenate((X,black_verify,white_verify,unknown_verify,bd_verify))
y = y+black_verify_labels+white_verify_labels+unknown_verify_labels+bd_verify_labels
print("ALL data check:")
print("len of X:", len(X))
print("len of y:", len(y))
# print(unknown_verify)

# Hold out 20% of the samples as a validation set for the autoencoder.
ratio_of_train = 0.8
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - ratio_of_train))
# Building the encoder: 75 input features -> 64 -> 2-D bottleneck used
# later as the low-dimensional embedding for plotting.
encoder = tflearn.input_data(shape=[None, 75])
encoder = tflearn.fully_connected(encoder, 64)
encoder = tflearn.fully_connected(encoder, 2)

# Building the decoder: mirror of the encoder, sigmoid output layer.
# NOTE(review): sigmoid squashes reconstructions into (0, 1) — this
# presumes the input features were scaled into that range; confirm
# against the preprocessing step.
decoder = tflearn.fully_connected(encoder, 64)
decoder = tflearn.fully_connected(decoder, 75, activation='sigmoid')

# Regression, with mean square error as the reconstruction loss.
net = tflearn.regression(decoder, optimizer='adam', learning_rate=0.0001,
                         loss='mean_square', metric=None)

# Training the auto encoder: targets are the inputs themselves.
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X_train, X_train, n_epoch=200, validation_set=(X_test, X_test),
          run_id="auto_encoder", batch_size=1024)

# Encoding X[0] for test
print("\nTest encoding of X[0]:")
# New model, re-using the same session, for weights sharing
encoding_model = tflearn.DNN(encoder, session=model.session)
print(encoding_model.predict([X[0]]))


# Use the trained encoder's 2-D bottleneck as the embedding (replaces t-SNE).
X_embedded = encoding_model.predict(X) #TSNE(n_components=2).fit_transform(X)

with open("tflearn_auto_enc_data_X_scaled.pkl", "wb") as f:
    pickle.dump([X_embedded, y], f)

如果是迭代次数不一样,则可能有一些差别,见下图,和上面的可能有些差别:

 

修改64为128:

 

posted @   bonelee  阅读(1520)  评论(1编辑  收藏  举报
编辑推荐:
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
阅读排行:
· 全程不用写代码,我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了,比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15:你的「微服务管家」又秀新绝活了
· DeepSeek 开源周回顾「GitHub 热点速览」
点击右上角即可分享
微信分享提示