图聚类常用函数和数据集
图聚类常用函数和数据集
1. 评价函数eval
本函数功能:计算每次训练acc,f1,nmi,ari
点击查看代码
import numpy as np
from munkres import Munkres, print_matrix
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
from scipy.optimize import linear_sum_assignment as linear
from sklearn import metrics
def cluster_acc(y_true, y_pred):
    """Compute clustering accuracy and macro-F1 after optimal label matching.

    Cluster ids are arbitrary, so the predicted ids are first mapped onto
    the ground-truth ids with the Munkres (Hungarian) algorithm, choosing
    the mapping that maximises the number of agreeing samples. The metrics
    are then computed on the remapped predictions.

    :param y_true: 1-D array of integer ground-truth labels.
    :param y_pred: 1-D array of integer predicted cluster ids, same length
        as ``y_true``. It is never modified; a private copy is used.
    :return: ``(acc, f1_macro)``, or ``None`` (after printing ``'error'``)
        when the number of predicted clusters still differs from the number
        of ground-truth classes after the padding step.
    """
    # Shift the labels so that the smallest ground-truth label becomes 0.
    y_true = y_true - np.min(y_true)
    # The padding step below writes into the predictions. Doing that on the
    # caller's array would silently change any metric computed from it
    # afterwards, so work on a copy.
    y_pred = np.array(y_pred, copy=True)

    true_labels = list(set(y_true))
    pred_labels = list(set(y_pred))
    n_classes = len(true_labels)

    if n_classes != len(pred_labels):
        # Inject every ground-truth label that never occurs in the
        # predictions by overwriting the leading entries, one per label.
        slot = 0
        for label in true_labels:
            if label not in pred_labels:
                y_pred[slot] = label
                slot += 1
        pred_labels = list(set(y_pred))

    if n_classes != len(pred_labels):
        print('error')
        return None

    # cost[i, j] = number of samples with true label i and predicted label j.
    cost = np.zeros((n_classes, n_classes), dtype=int)
    for i, true_label in enumerate(true_labels):
        in_class = y_true == true_label
        for j, pred_label in enumerate(pred_labels):
            cost[i, j] = np.count_nonzero(in_class & (y_pred == pred_label))

    # Munkres minimises the total cost, so negate the agreement counts.
    assignment = Munkres().compute((-cost).tolist())

    # Rewrite every predicted id as the ground-truth label it was matched to.
    new_predict = np.zeros(len(y_pred))
    for i, true_label in enumerate(true_labels):
        matched = pred_labels[assignment[i][1]]
        new_predict[y_pred == matched] = true_label

    acc = metrics.accuracy_score(y_true, new_predict)
    f1_macro = metrics.f1_score(y_true, new_predict, average='macro')
    return acc, f1_macro
def eva(y_true, y_pred, epoch='0'):
    """Print and return ACC, F1, NMI and ARI for one clustering result.

    :param y_true: ground-truth labels.
    :param y_pred: predicted cluster ids.
    :param epoch: label printed at the start of the log line; any value is
        accepted and converted with ``str``.
    :return: ``(acc, f1, nmi, ari)``.
    """
    # Hand the matching step its own copy so that nothing it does to the
    # predictions can leak into the NMI / ARI computed below.
    acc, f1 = cluster_acc(y_true, np.array(y_pred, copy=True))
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    # str() keeps the output identical for string epochs and avoids a
    # TypeError when the caller passes the loop counter as an int.
    print("第" + str(epoch), ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi),
          ', ari {:.4f}'.format(ari), ', f1 {:.4f}'.format(f1))
    return acc, f1, nmi, ari
2.加载数据集(txt)
本函数功能:将txt数据类型转成属性矩阵和邻接矩阵
点击查看代码
# 当数据集是txt:load 加载函数
import numpy as np
import torch
from torch.utils.data import Dataset
import scipy.sparse as sp
# 接口介绍: Dataset ,DataLoader 是pytorch 提供预处理和加载数据的两个接口
"""
先把原始数据转变成 torch.utils.data.Dataset 类,
随后再把得到的 torch.utils.data.Dataset 类,
当作一个参数传递给 torch.utils.data.DataLoader 类,
得到一个数据加载器,这个数据加载器每次可以返回一个 Batch 的数据供模型训练使用。
参考链接:https://blog.csdn.net/weixin_44211968/article/details/123744513
https://www.jb51.net/article/252552.htm
"""
class LoadDataset(Dataset):
    """Expose the rows of a feature matrix through the ``Dataset`` protocol.

    The three hooks a custom dataset has to provide are implemented here:
    ``__init__`` stores the data, ``__getitem__`` gives access by index and
    ``__len__`` reports how many samples there are.
    """

    def __init__(self, data):
        # Only a reference is stored; the matrix is not copied.
        self.x = data

    def __len__(self):
        # One sample per row of the stored matrix (the number of nodes).
        n_rows = self.x.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Return the selected row as a float tensor together with the index
        # itself wrapped in a tensor.
        row = np.array(self.x[idx])
        features = torch.from_numpy(row).float()
        index = torch.from_numpy(np.array(idx))
        return features, index
def load_graph(k=False, graph_k_save_path="", graph_save_path="", data_path=""):
    """Build the adjacency tensor from an edge-list file and a feature file.

    :param k: selects which edge-list path is used: ``True`` reads
        ``graph_k_save_path`` (described by the author as the graph built
        for non-graph data), ``False`` reads ``graph_save_path``.
    :param graph_k_save_path: path of the edge list used when ``k`` is true.
    :param graph_save_path: path of the edge list used when ``k`` is false.
    :param data_path: path of the feature matrix text file; only its row
        count (the number of nodes) is used here.
    :return: the symmetrised adjacency matrix after ``normalize`` and
        ``sparse_mx_to_torch_sparse_tensor`` have been applied to it.
    """
    if k:
        path = graph_k_save_path
        print("加载非图数据结构的graph.txt路径:",path)
    else:
        path = graph_save_path
        print("加载图数据结构的graph.txt路径:", path)

    # The feature matrix is read only to learn how many nodes there are.
    data = np.loadtxt(data_path, dtype=float)
    n, _ = data.shape

    # Node ids are expected to be 0..n-1; the lookup table maps each id to
    # its row index (an id outside that range maps to None).
    node_ids = np.arange(n, dtype=np.int32)
    idx_map = {j: i for i, j in enumerate(node_ids)}

    edges_unordered = np.genfromtxt(path, dtype=np.int32)
    # A file holding a single edge is parsed as a flat (2,) array; restore
    # the (num_edges, 2) layout so the column indexing below works.
    if edges_unordered.ndim == 1 and edges_unordered.size == 2:
        edges_unordered = edges_unordered.reshape(1, 2)

    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(n, n), dtype=np.float32)
    # Symmetrise: wherever the transposed entry is larger it replaces the
    # original one, i.e. the element-wise maximum of adj and adj.T.
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    adj = normalize(adj)
    adj = sparse_mx_to_torch_sparse_tensor(adj)
    return adj
"""
代码中采用的归一化
gcn操作有两种:
第一种就是行归一化
第二种就是gcn中的归一化
"""
def normalize(adj):
    """Add self-loops to ``adj`` and scale every row by its inverse row sum.

    :param adj: square scipy sparse adjacency matrix.
    :return: the sparse product ``D^-1 (A + I)``, where ``D`` is the diagonal
        matrix of row sums of ``A + I``.
    """
    n_nodes = adj.shape[0]
    with_loops = adj + sp.eye(n_nodes)

    # Row sums of A + I, kept as an (n, 1) array before inversion.
    degrees = np.array(with_loops.sum(1))
    inv_degrees = np.power(degrees, -1).flatten()
    # A zero row sum inverts to inf; such rows are scaled by 0 instead.
    inv_degrees[np.isinf(inv_degrees)] = 0.

    # Left-multiplication only: this is row normalisation; no scaling is
    # applied from the right.
    scaling = sp.diags(inv_degrees)
    return scaling.dot(with_loops)
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix into a float32 torch sparse COO tensor.

    :param sparse_mx: scipy sparse matrix in any format.
    :return: torch sparse COO tensor with the same shape and entries.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    # torch expects a (2, nnz) int64 index tensor: row ids on top, column
    # ids below.
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    shape = torch.Size(coo.shape)
    # torch.sparse_coo_tensor is the supported factory; the legacy
    # torch.sparse.FloatTensor(indices, values, shape) form is deprecated.
    return torch.sparse_coo_tensor(indices, values, shape)
"""
scipy.sparse 矩阵转numpy
https://www.codenong.com/26576524/
"""
if __name__ == '__main__':
    # Usage example for the loaders defined in this snippet.
    # Feature matrix X; the original note gives 3327 nodes x 3703 features.
    x = np.loadtxt('data/cite.txt', dtype=float)
    # One integer label per node, in the same order as the rows of x.
    y = np.loadtxt('data/cite_label.txt', dtype=int)
    # Wrap the feature matrix; it stays reachable as dataset.x.
    dataset=LoadDataset(x)
    # k=False selects graph_save_path as the edge-list file.
    adj=load_graph(k=False, graph_k_save_path="", graph_save_path="./graph/cite_graph.txt", data_path="./data/cite.txt")
3.训练指标可视化
本函数功能:训练过程中的acc,f1,nmi,ari可视化
点击查看代码
import matplotlib.pyplot as plt
import numpy as np
import pyecharts.options as opts
from pyecharts.charts import Line
from pyecharts.globals import ThemeType
def EpochVision(x, y, modelname, arg, epoch='1'):
    """Plot one metric against the epochs and save it under ``./view/``.

    :param x: x-axis values (epochs).
    :param y: metric values, one per entry of ``x``.
    :param modelname: model name shown in the plot title.
    :param arg: metric name; used in the title, as the y-axis label and in
        the output file name.
    :param epoch: suffix appended to the file name; converted with ``str``.
    :return: None. The figure is written to ``./view/<arg><epoch>.jpg``.
    """
    import os

    out_dir = "./view/"
    # savefig does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    try:
        plt.plot(np.array(x), np.array(y))
        plt.title(modelname + ": " + arg)
        plt.xlabel('Epochs')
        plt.ylabel(arg)
        plt.savefig(out_dir + arg + str(epoch) + ".jpg")
    finally:
        # Always release the current figure, even when saving fails, so
        # repeated calls do not pile curves onto the same axes.
        plt.close()
def EpochEchart(x, acc, f1, nmi, ari, modelname="模型名字", iters='1'):
    """Render ACC, F1, NMI and ARI curves into one pyecharts HTML page.

    :param x: x-axis values (epochs).
    :param acc: accuracy per epoch.
    :param f1: F1 score per epoch.
    :param nmi: normalised mutual information per epoch.
    :param ari: adjusted Rand index per epoch.
    :param modelname: model name; prefixes every series name and the output
        file name.
    :param iters: run identifier appended to the file name; converted with
        ``str``.
    :return: None. The page is written to ``./<modelname><iters>.html``.
    """
    chart = Line(
        init_opts=opts.InitOpts(
            width="1600px",
            height="700px",
            page_title="可视化epocch训练指标",
            theme=ThemeType.DARK,
        )
    )
    chart.set_global_opts(
        tooltip_opts=opts.TooltipOpts(is_show=False),
        xaxis_opts=opts.AxisOpts(type_="category"),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
    )
    chart.add_xaxis(xaxis_data=x)

    # The four series share every display option and differ only in their
    # name and data.
    for metric_name, values in (("ACC", acc), ("F1", f1), ("NMI", nmi), ("ARI", ari)):
        chart.add_yaxis(
            series_name=modelname + " : " + metric_name,
            y_axis=values,
            symbol="emptyCircle",
            is_symbol_show=True,
            is_smooth=True,
            label_opts=opts.LabelOpts(is_show=True),
        )

    # str() lets callers pass the run number as an int.
    chart.render("./" + modelname + str(iters) + ".html")
if __name__ == '__main__':
    # Two sample value ranges; nothing in the visible snippet uses them.
    x, y = np.arange(0.4, 0.7, .03), np.arange(0, 1, .05)
4.可视化聚类效果
本函数功能:将节点聚类效果可视化
点击查看代码
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
def t_sne(embeds, labels, sample_num=2000, show_fig=True, device=True):
    """Visualise embeddings in 2-D with t-SNE, drawing each point as its label.

    :param embeds: embedding matrix, one row per sample.
    :param labels: integer label per sample; must support indexing with an
        index array.
    :param sample_num: number of rows drawn at random, with replacement.
    :param show_fig: whether to call ``plt.show()``.
    :param device: ``True`` when ``embeds`` is already on the CPU; ``False``
        when it is a tensor that has to be moved to the CPU first.
    :return: the matplotlib figure.
    """
    if not device:
        # Detach as well, so tensors that still carry gradients can be
        # converted to NumPy.
        embeds = embeds.detach().cpu().numpy()

    # sampling
    sample_index = np.random.randint(0, embeds.shape[0], sample_num)
    sample_embeds = embeds[sample_index]
    sample_labels = labels[sample_index]

    # t-SNE
    ts = TSNE(n_components=2, init='pca', random_state=0)
    ts_embeds = ts.fit_transform(sample_embeds[:, :])

    # remove outliers: keep the points lying within three standard
    # deviations of the mean on both axes, and keep the labels aligned.
    mean, std = np.mean(ts_embeds, axis=0), np.std(ts_embeds, axis=0)
    keep = (np.abs(ts_embeds - mean) <= 3 * std).all(axis=1)
    ts_embeds = ts_embeds[keep]
    sample_labels = sample_labels[keep]

    # normalization: rescale both axes to [0, 1]
    x_min, x_max = np.min(ts_embeds, 0), np.max(ts_embeds, 0)
    norm_ts_embeds = (ts_embeds - x_min) / (x_max - x_min)

    # plot
    fig = plt.figure()
    for point, point_label in zip(norm_ts_embeds, sample_labels):
        plt.text(point[0], point[1], str(point_label),
                 color=plt.cm.Set1(point_label % 7),
                 fontdict={'weight': 'bold', 'size': 7})
    plt.xticks([])
    plt.yticks([])
    plt.title('t-SNE', fontsize=14)
    plt.axis('off')
    if show_fig:
        plt.show()
    return fig
def similarity_plot(embedding, label, sample_num=1000, show_fig=True, device=True):
    """Draw a heat map of pairwise cosine similarity, rows grouped by label.

    :param embedding: embedding (or feature) matrix, one row per sample.
    :param label: ground-truth label per sample, as a NumPy array.
    :param sample_num: only the first ``sample_num`` samples are used.
    :param show_fig: whether to call ``plt.show()``.
    :param device: ``True`` when ``embedding`` is already on the CPU;
        ``False`` when it is a tensor that has to be moved to the CPU first.
    :return: the matplotlib figure.
    """
    if not device:
        # Detach as well, so tensors that still carry gradients can be
        # converted to NumPy.
        embedding = embedding.detach().cpu().numpy()

    # sampling
    label_sample = label[:sample_num]
    embedding_sample = embedding[:sample_num, :]

    # sort the embedding based on label: the labels ride along as a last
    # column, which is dropped again once the rows are reordered.
    cat = np.concatenate([embedding_sample, label_sample.reshape(-1, 1)], axis=1)
    arg_sort = np.argsort(label_sample)
    cat = cat[arg_sort]
    embedding_sample = cat[:, :-1]

    # cosine similarity of the L2-normalised rows
    row_norms = np.sqrt(np.sum(embedding_sample ** 2, axis=1)).reshape(-1, 1)
    norm_embedding_sample = embedding_sample / row_norms
    cosine_sim = np.matmul(norm_embedding_sample, norm_embedding_sample.transpose())
    # NOTE: this zeroes every value below 1e-5, negative similarities
    # included, so the heat map never shows values below 0.
    cosine_sim[cosine_sim < 1e-5] = 0

    # figure
    fig = plt.figure()
    sns.heatmap(data=cosine_sim, cmap="RdBu_r", vmin=-1, vmax=1)
    plt.axis("off")

    # plot
    if show_fig:
        plt.show()
    return fig
5.数据集
txt数据集:https://github.com/bdy9527/SDCN
https://github.com/bdy9527/SDCN/tree/master
Baidu Netdisk:
graph: 链接:https://pan.baidu.com/s/1MEWr1KyrtBQndVNy8_y2Lw 密码:opc1
data: 链接:https://pan.baidu.com/s/1kqoWlElbWazJyrTdv1sHNg 密码:1gd4
Google Drive:
graph: https://drive.google.com/file/d/10rnVwIAuVRczmZJSX7mpSTR0-HVnMWLh/view?usp=sharing
data: https://drive.google.com/file/d/1VjH6xqt82GaQwwiy-4O2GedMgQMLN6dm/view?usp=sharing
npy类型数据集:https://github.com/yueliu1999/Awesome-Deep-Graph-Clustering#benchmark-datasets
npy数据处理:https://blog.csdn.net/qq_51392112/article/details/129429108
6.google云盘和colab
google 团队网盘申请:https://www.iculture.cc/knowledge/pig=7974
colab使用技巧:
http://element-ui.cn/article/show-119360.html?action=onClick
https://blog.csdn.net/qq_43684592/article/details/116302893
https://blog.csdn.net/Xuxianmincs/article/details/89601122
colab使用技巧:
http://element-ui.cn/article/show-119360.html?action=onClick
https://blog.csdn.net/qq_43684592/article/details/116302893
https://blog.csdn.net/Xuxianmincs/article/details/89601122
colab gpu 训练报错:
https://www.cnblogs.com/seansheep/p/16020753.html
https://www.cnblogs.com/booturbo/p/16341650.html
https://blog.csdn.net/hshudoudou/article/details/127383111
7.深度图聚类框架
一个可扩展的深度属性图聚类的统一代码框架:https://github.com/Marigoldwu/A-Unified-Framework-for-Deep-Attribute-Graph-Clustering
本文来自博客园,作者:我爱读论文,转载请注明原文链接:https://www.cnblogs.com/life1314/p/17323734.html