图聚类常用函数和数据集

1. 评价函数eva

本函数功能：计算每次训练acc,f1,nmi,ari

点击查看代码

import numpy as np
from munkres import Munkres, print_matrix
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi_score
from sklearn.metrics import adjusted_rand_score as ari_score
from scipy.optimize import linear_sum_assignment as linear
from sklearn import metrics


def cluster_acc(y_true, y_pred):
    """Compute clustering accuracy and macro-F1 after optimal label matching.

    Cluster ids are arbitrary, so the predicted clusters are first mapped onto
    the ground-truth classes with the Munkres (Hungarian) algorithm, choosing
    the one-to-one mapping that maximises the number of agreeing samples.
    Accuracy and macro-averaged F1 are then computed on the remapped labels.

    :param y_true: 1-D array-like of numeric ground-truth labels
    :param y_pred: 1-D array-like of numeric cluster assignments; it is never
        modified, the function works on a private copy
    :return: ``(acc, f1_macro)``; ``None`` (after printing ``error``) when the
        number of predicted clusters still differs from the number of true
        classes after the label-injection step
    """
    y_true = np.asarray(y_true)
    y_true = y_true - np.min(y_true)
    # Copy: the injection step below overwrites predictions, and the caller
    # usually keeps using its own array afterwards (e.g. for NMI / ARI).
    y_pred = np.array(y_pred, copy=True)

    true_labels = list(set(y_true))
    pred_labels = list(set(y_pred))

    if len(true_labels) != len(pred_labels):
        # Force every true label that never occurs in the prediction into the
        # first few slots so that the assignment matrix can become square.
        slot = 0
        for label in true_labels:
            if label not in pred_labels:
                y_pred[slot] = label
                slot += 1
        pred_labels = list(set(y_pred))

    if len(true_labels) != len(pred_labels):
        print('error')
        return

    # Contingency table: cost[i][j] = samples of true class i in cluster j.
    cost = np.zeros((len(true_labels), len(pred_labels)), dtype=int)
    for i, true_label in enumerate(true_labels):
        in_true_class = y_true == true_label
        for j, pred_label in enumerate(pred_labels):
            cost[i, j] = np.count_nonzero(in_true_class & (y_pred == pred_label))

    # Munkres minimises cost, so negate the counts to maximise the overlap.
    indexes = Munkres().compute((-cost).tolist())

    # Relabel every predicted cluster with the true class it was matched to.
    new_predict = np.zeros(len(y_pred))
    for i, true_label in enumerate(true_labels):
        matched_cluster = pred_labels[indexes[i][1]]
        new_predict[y_pred == matched_cluster] = true_label

    acc = metrics.accuracy_score(y_true, new_predict)
    f1_macro = metrics.f1_score(y_true, new_predict, average='macro')
    return acc, f1_macro


def eva(y_true, y_pred, epoch='0'):
    """Evaluate one clustering result and print a one-line summary.

    :param y_true: ground-truth labels
    :param y_pred: predicted cluster assignments
    :param epoch: tag printed in front of the metrics; any value is accepted
        and converted with ``str`` (the default is the string ``'0'``)
    :return: ``(acc, f1, nmi, ari)``
    """
    # NOTE(review): cluster_acc returns None when the cluster count cannot be
    # reconciled with the class count; the unpacking below then raises.
    acc, f1 = cluster_acc(y_true, y_pred)
    nmi = nmi_score(y_true, y_pred, average_method='arithmetic')
    ari = ari_score(y_true, y_pred)
    # str() lets callers pass the loop counter directly instead of a string.
    print("第" + str(epoch), ':acc {:.4f}'.format(acc), ', nmi {:.4f}'.format(nmi), ', ari {:.4f}'.format(ari),
            ', f1 {:.4f}'.format(f1))
    return acc, f1, nmi, ari

2.加载数据集(txt)

本函数功能：将txt数据类型转成属性矩阵和邻接矩阵

点击查看代码

# 当数据集是txt:load 加载函数
import numpy as np
import torch
from torch.utils.data import Dataset
import scipy.sparse as sp
# 接口介绍: Dataset ,DataLoader 是pytorch 提供预处理和加载数据的两个接口
"""
   先把原始数据转变成 torch.utils.data.Dataset 类，
随后再把得到的 torch.utils.data.Dataset 类,
当作一个参数传递给 torch.utils.data.DataLoader 类，
得到一个数据加载器，这个数据加载器每次可以返回一个 Batch 的数据供模型训练使用。
参考链接：https://blog.csdn.net/weixin_44211968/article/details/123744513
https://www.jb51.net/article/252552.htm
"""
class LoadDataset(Dataset):
    """Wrap a feature matrix so it can be consumed through the Dataset API.

    A custom dataset subclasses ``Dataset`` and provides three methods:
    ``__init__`` receives the data, ``__getitem__`` gives access by index and
    ``__len__`` reports how many items there are.
    """

    def __init__(self, data):
        # The matrix is stored as-is; no copy is made.
        self.x = data

    def __len__(self):
        # Size of the first dimension of the stored matrix.
        n_rows = self.x.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Return the selected row as a float tensor together with the index
        # itself wrapped in a tensor.
        row = np.array(self.x[idx])
        features = torch.from_numpy(row).float()
        index = torch.from_numpy(np.array(idx))
        return features, index

def load_graph(k=False, graph_k_save_path="", graph_save_path="", data_path=""):
    """Build the normalised adjacency matrix from an edge-list text file.

    :param k: selects which edge file is read; ``True`` uses
        ``graph_k_save_path`` (graph built for non-graph data), ``False`` uses
        ``graph_save_path`` (native graph data)
    :param graph_k_save_path: edge-list file used when ``k`` is true
    :param graph_save_path: edge-list file used when ``k`` is false
    :param data_path: feature-matrix text file; only its row count is used,
        as the number of nodes
    :return: the symmetrised, normalised adjacency matrix as a torch sparse
        tensor
    :raises ValueError: if an edge refers to a node id outside ``[0, n)``
    """
    if k:
        path = graph_k_save_path
        print("加载非图数据结构的graph.txt路径:", path)
    else:
        path = graph_save_path
        print("加载图数据结构的graph.txt路径:", path)

    # ndmin=2 keeps a one-row feature file two-dimensional.
    data = np.loadtxt(data_path, dtype=float, ndmin=2)
    n = data.shape[0]

    edges = np.genfromtxt(path, dtype=np.int32)
    if edges.size == 0:
        edges = edges.reshape(0, 2)
    else:
        # A file holding a single edge is parsed as a 1-D array.
        edges = np.atleast_2d(edges)

    endpoints = edges[:, :2]
    if endpoints.size and (endpoints.min() < 0 or endpoints.max() >= n):
        raise ValueError(
            "edge list %r references node ids outside [0, %d)" % (path, n))

    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])), shape=(n, n), dtype=np.float32)
    # Symmetrise: wherever the transpose holds the larger value, take it.
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    adj = normalize(adj)
    adj = sparse_mx_to_torch_sparse_tensor(adj)
    return adj

"""
代码中采用的归一化
归一化操作有两种: 
    第一种就是行归一化
    第二种就是gcn中的归一化
"""
def normalize(adj):
    """Row-normalise an adjacency matrix after adding self-loops.

    :param adj: square scipy sparse adjacency matrix A
    :return: ``D^-1 (A + I)`` where ``D`` holds the row sums of ``A + I``
    """
    with_self_loops = adj + sp.eye(adj.shape[0])  # A + I
    row_totals = np.array(with_self_loops.sum(axis=1))
    inv_totals = np.power(row_totals, -1).ravel()
    # Rows that sum to zero produce inf; treat their inverse as zero.
    inv_totals[np.isinf(inv_totals)] = 0.
    scaling = sp.diags(inv_totals)
    return scaling.dot(with_self_loops)
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix into a torch sparse COO tensor.

    :param sparse_mx: scipy sparse matrix
    :return: float32 torch sparse tensor with the same shape and entries
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse.FloatTensor is a deprecated constructor; sparse_coo_tensor
    # is its replacement and infers float32 from `values`.
    return torch.sparse_coo_tensor(indices, values, shape)
"""

scipy.sparse 矩阵转numpy
https://www.codenong.com/26576524/
"""

if __name__ == '__main__':
    # Node feature matrix X (the original note gives 3327 nodes x 3703 features).
    x = np.loadtxt('data/cite.txt', dtype=float)
    # One label per node.
    y = np.loadtxt('data/cite_label.txt', dtype=int)
    # Wrap the features in the custom dataset; dataset.x is the feature matrix.
    dataset = LoadDataset(x)
    # Normalised adjacency matrix built from the native graph edge list.
    adj = load_graph(
        k=False,
        graph_k_save_path="",
        graph_save_path="./graph/cite_graph.txt",
        data_path="./data/cite.txt",
    )

3.训练指标可视化

本函数功能:训练过程中的acc,f1,nmi,ari可视化

点击查看代码


import matplotlib.pyplot as plt
import numpy as np

import pyecharts.options as opts
from pyecharts.charts import Line
from pyecharts.globals import ThemeType
def EpochVision( x,y,modelname,arg,epoch='1'):
    """Plot one training metric against the epochs and save it as a JPEG.

    The image is written to ``./view/<arg><epoch>.jpg``; the ``view``
    directory is created when it does not exist yet.

    :param x: x-axis values
    :param y: metric values, one per entry of ``x``
    :param modelname: model name shown in the title
    :param arg: metric name; used in the title, the y-axis label and the file
        name
    :param epoch: suffix for the file name; converted with ``str``
    """
    import os

    out_dir = "./view/"
    os.makedirs(out_dir, exist_ok=True)

    # Use a dedicated figure so that a figure left open elsewhere is neither
    # drawn on nor closed by this function.
    fig, ax = plt.subplots()
    try:
        ax.plot(np.array(x), np.array(y))
        ax.set_title(modelname + ": " + arg)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(arg)
        fig.savefig(out_dir + arg + str(epoch) + ".jpg")
    finally:
        plt.close(fig)



def EpochEchart( x,acc,f1,nmi,ari,modelname="模型名字",iters='1'):
    """Render the ACC / F1 / NMI / ARI training curves into one HTML chart.

    The chart is written to ``./<modelname><iters>.html``.

    :param x: x-axis values
    :param acc: accuracy per x value (higher is better)
    :param f1: F1 score per x value (higher is better)
    :param nmi: normalised mutual information per x value (higher is better)
    :param ari: adjusted Rand index per x value (higher is better)
    :param modelname: model name; prefixes every series name and the file name
    :param iters: run number used as file-name suffix; converted with ``str``
    :return: None
    """
    chart = Line(
        init_opts=opts.InitOpts(width="1600px",
                                height="700px",
                                page_title="可视化epocch训练指标",
                                theme=ThemeType.DARK)
    )
    chart.set_global_opts(
        tooltip_opts=opts.TooltipOpts(is_show=False),
        xaxis_opts=opts.AxisOpts(type_="category"),
        yaxis_opts=opts.AxisOpts(
            type_="value",
            axistick_opts=opts.AxisTickOpts(is_show=True),
            splitline_opts=opts.SplitLineOpts(is_show=True),
        ),
    )
    chart.add_xaxis(xaxis_data=x)

    # All four series share the same look; only the name and the data differ.
    for metric_name, values in (("ACC", acc), ("F1", f1), ("NMI", nmi), ("ARI", ari)):
        chart.add_yaxis(
            series_name=modelname + " : " + metric_name,
            y_axis=values,
            symbol="emptyCircle",
            is_symbol_show=True,
            is_smooth=True,
            label_opts=opts.LabelOpts(is_show=True),
        )

    # str() lets callers pass the run counter as an int.
    chart.render("./" + modelname + str(iters) + ".html")
if __name__ == '__main__':
    # Sample value ranges left over from experimenting; nothing is plotted.
    x = np.arange(0.4, 0.7, .03)
    y = np.arange(0, 1, .05)

4.可视化聚类效果

本函数功能：将节点聚类效果可视化

点击查看代码



import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


def t_sne(embeds, labels, sample_num=2000, show_fig=True,device=True):
    """Visualise embeddings in 2-D with t-SNE.

    :param embeds: embedding matrix, one row per sample (numpy array or torch
        tensor)
    :param labels: one numeric label per row of ``embeds``
    :param sample_num: maximum number of rows to draw; rows are sampled
        without replacement, so at most ``len(embeds)`` rows are used
    :param show_fig: call ``plt.show()`` when true
    :param device: kept for backward compatibility (True: CPU, False: CUDA);
        torch tensors are detected and moved to the CPU regardless
    :return fig: the matplotlib figure
    """
    # Torch tensors expose detach(); convert them so numpy and sklearn can
    # consume the data wherever it lives.
    if hasattr(embeds, "detach"):
        embeds = embeds.detach().cpu().numpy()
    if hasattr(labels, "detach"):
        labels = labels.detach().cpu().numpy()
    embeds = np.asarray(embeds)
    labels = np.asarray(labels)

    # sampling (without replacement, so no point is embedded twice)
    total = embeds.shape[0]
    sample_index = np.random.choice(total, size=min(sample_num, total), replace=False)
    sample_embeds = embeds[sample_index]
    sample_labels = labels[sample_index]

    # t-SNE
    ts = TSNE(n_components=2, init='pca', random_state=0)
    ts_embeds = ts.fit_transform(sample_embeds[:, :])

    # remove outliers: keep points within 3 standard deviations of the mean
    # on every axis, and drop the matching labels so both stay aligned
    mean, std = np.mean(ts_embeds, axis=0), np.std(ts_embeds, axis=0)
    inlier = (np.abs(ts_embeds - mean) <= 3 * std).all(axis=1)
    ts_embeds = ts_embeds[inlier]
    sample_labels = sample_labels[inlier]

    # normalization to [0, 1]; a degenerate axis is left unscaled
    x_min, x_max = np.min(ts_embeds, 0), np.max(ts_embeds, 0)
    span = x_max - x_min
    span[span == 0] = 1
    norm_ts_embeds = (ts_embeds - x_min) / span

    # plot every sample as its label text, coloured by label
    fig = plt.figure()
    for (px, py), label in zip(norm_ts_embeds, sample_labels):
        plt.text(px, py, str(label),
                 color=plt.cm.Set1(int(label) % 7),
                 fontdict={'weight': 'bold', 'size': 7})
    plt.xticks([])
    plt.yticks([])
    plt.title('t-SNE', fontsize=14)
    plt.axis('off')
    if show_fig:
        plt.show()

    return fig


def similarity_plot(embedding, label, sample_num=1000, show_fig=True,device=True):
    """Show the cosine-similarity heat map of embeddings grouped by label.

    :param embedding: embedding matrix, one row per sample (numpy array or
        torch tensor)
    :param label: ground-truth label per row
    :param sample_num: number of leading rows to use
    :param show_fig: call ``plt.show()`` when true
    :param device: kept for backward compatibility (True: CPU, False: CUDA);
        torch tensors are detected and moved to the CPU regardless
    :return fig: the matplotlib figure
    """
    # Torch tensors expose detach(); convert them before any numpy operation.
    if hasattr(embedding, "detach"):
        embedding = embedding.detach().cpu().numpy()
    if hasattr(label, "detach"):
        label = label.detach().cpu().numpy()
    embedding = np.asarray(embedding, dtype=np.float64)
    label = np.asarray(label)

    # sampling: the first sample_num rows
    label_sample = label[:sample_num]
    embedding_sample = embedding[:sample_num, :]

    # sort the embedding by label so each class forms a diagonal block
    arg_sort = np.argsort(label_sample)
    embedding_sample = embedding_sample[arg_sort]

    # cosine similarity; zero vectors keep a norm of 1 so they give 0, not NaN
    norms = np.sqrt(np.sum(embedding_sample ** 2, axis=1)).reshape(-1, 1)
    norms[norms == 0] = 1.0
    norm_embedding_sample = embedding_sample / norms
    cosine_sim = np.matmul(norm_embedding_sample, norm_embedding_sample.transpose())
    cosine_sim[cosine_sim < 1e-5] = 0

    # figure
    fig = plt.figure()
    sns.heatmap(data=cosine_sim, cmap="RdBu_r", vmin=-1, vmax=1)
    plt.axis("off")

    # plot
    if show_fig:
        plt.show()
    return fig