Keras版GCN源码解析
直接上代码:
后面会在这份源码的基础上做实验;
TensorFlow版的GCN源码也看过了,但是看不太懂,欢迎交流GCN相关内容。
1 setup.py
# Packaging script for the ``kegra`` distribution (Keras graph convolutions).
from setuptools import setup
from setuptools import find_packages

setup(
    name='kegra',  # name of the generated distribution
    version='0.0.1',  # version number
    description='Deep Learning on Graphs with Keras',  # short summary
    author='Thomas Kipf',  # package author
    author_email='thomas.kipf@gmail.com',  # author's e-mail address
    url='https://tkipf.github.io',  # project home page
    download_url='...',  # download location
    license='MIT',  # licence
    # Runtime dependencies. numpy and scipy are imported directly by the
    # package's utils module, so they are declared alongside keras.
    install_requires=['keras', 'numpy', 'scipy'],
    # Optional dependencies used for saving models. ``json`` is part of the
    # Python standard library and is therefore not listed as a requirement.
    extras_require={
        'model_saving': ['h5py'],
    },
    package_data={'kegra': ['README.md']},
    # find_packages() searches the directory containing setup.py for every
    # package that has an __init__.py.
    packages=find_packages(),
)
2 utils.py
# Features that are not yet part of the language standard of the running
# interpreter have to be imported from __future__; this one makes print()
# a function.
from __future__ import print_function

import scipy.sparse as sp  # sparse matrix library
import numpy as np  # array library
# Sparse eigenvalue solver. Imported from the public namespace: the private
# path scipy.sparse.linalg.eigen.arpack is not importable in recent SciPy.
from scipy.sparse.linalg import eigsh, ArpackNoConvergence


def encode_onehot(labels):
    """Convert a sequence of class labels into a one-hot matrix.

    :param labels: sequence of hashable, sortable labels
    :return: int32 array with one row per label and one column per distinct
        class; columns follow the sorted order of the distinct labels, so the
        encoding is the same on every run
    """
    # Sorting removes the dependence on set iteration order.
    classes = sorted(set(labels))
    class_index = {c: i for i, c in enumerate(classes)}
    # Build the identity matrix once and pick one row per label.
    identity = np.identity(len(classes), dtype=np.int32)
    rows = np.array([class_index[c] for c in labels], dtype=np.intp)
    return identity[rows]


def load_data(path="data/cora/", dataset="cora"):
    """Load citation network dataset (cora only for now).

    Reads ``<path><dataset>.content`` (per row: node id, feature columns,
    class label in the last column) and ``<path><dataset>.cites`` (per row:
    a pair of node ids).

    :param path: directory prefix, concatenated directly with the file name
    :param dataset: base name of the two data files
    :return: (dense feature matrix, symmetric sparse adjacency matrix,
        one-hot label array)
    """
    print('Loading {} dataset...'.format(dataset))

    # Everything is read as strings first; columns are converted below.
    idx_features_labels = np.genfromtxt(
        "{}{}.content".format(path, dataset), dtype=np.dtype(str))
    # Feature columns as a CSR (compressed sparse row) matrix.
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    # Last column holds the class label.
    labels = encode_onehot(idx_features_labels[:, -1])

    # build graph
    # Node ids as they appear in the file.
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    # Map node id -> row index.
    idx_map = {j: i for i, j in enumerate(idx)}
    # Citation pairs expressed with node ids.
    edges_unordered = np.genfromtxt(
        "{}{}.cites".format(path, dataset), dtype=np.int32)
    # The same pairs expressed with row indices.
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    # Directed adjacency matrix in coordinate (COO) format.
    adj = sp.coo_matrix(
        (np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
        shape=(labels.shape[0], labels.shape[0]), dtype=np.float32)

    # build symmetric adjacency matrix
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    print('Dataset has {} nodes, {} edges, {} features.'.format(
        adj.shape[0], edges.shape[0], features.shape[1]))

    return features.todense(), adj, labels


def _inverse_degree_matrix(adj, exponent):
    """Return diag(rowsum(adj) ** exponent) with zero-degree entries set to 0."""
    degrees = np.array(adj.sum(1)).flatten()
    # A zero row sum raised to a negative power yields inf; silence the
    # warning and zero those entries so isolated nodes stay isolated.
    with np.errstate(divide='ignore'):
        scaled = np.power(degrees, exponent)
    scaled[np.isinf(scaled)] = 0.
    return sp.diags(scaled, 0)


def normalize_adj(adj, symmetric=True):
    """Normalize an adjacency matrix by node degree.

    :param adj: scipy sparse adjacency matrix
    :param symmetric: if True compute (A D^-1/2)^T D^-1/2, which equals
        D^-1/2 A D^-1/2 for a symmetric A; otherwise compute D^-1 A
    :return: normalized matrix in CSR format
    """
    if symmetric:
        d = _inverse_degree_matrix(adj, -0.5)
        a_norm = adj.dot(d).transpose().dot(d).tocsr()
    else:
        d = _inverse_degree_matrix(adj, -1)
        a_norm = d.dot(adj).tocsr()
    return a_norm


def preprocess_adj(adj, symmetric=True):
    """Add self-connections to ``adj`` and normalize the result.

    :param adj: scipy sparse adjacency matrix
    :param symmetric: forwarded to :func:`normalize_adj`
    :return: normalized A + I in CSR format
    """
    adj = adj + sp.eye(adj.shape[0])
    adj = normalize_adj(adj, symmetric)
    return adj


def sample_mask(idx, l):
    """Create a boolean sample mask.

    :param idx: indices of the samples to select
    :param l: total number of samples
    :return: boolean array of length ``l`` that is True at ``idx`` and False
        everywhere else
    """
    # Builtin bool instead of the np.bool alias, which NumPy has removed.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask


def get_splits(y):
    """Split the labels into fixed train / validation / test sets.

    Rows 0-139 form the training set, 200-499 the validation set and
    500-1499 the test set.

    :param y: label array with at least 1500 rows
    :return: (y_train, y_val, y_test, idx_train, idx_val, idx_test,
        train_mask); each y_* array has the shape of ``y`` and is zero
        outside its own split
    """
    idx_train = range(140)
    idx_val = range(200, 500)
    idx_test = range(500, 1500)

    y_train = np.zeros(y.shape, dtype=np.int32)
    y_val = np.zeros(y.shape, dtype=np.int32)
    y_test = np.zeros(y.shape, dtype=np.int32)
    y_train[idx_train] = y[idx_train]
    y_val[idx_val] = y[idx_val]
    y_test[idx_test] = y[idx_test]

    # Mask selecting the training rows.
    train_mask = sample_mask(idx_train, y.shape[0])
    return y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask


def categorical_crossentropy(preds, labels):
    """Mean categorical cross-entropy.

    :param preds: array of model outputs
    :param labels: label array of the same shape; nonzero entries select the
        predictions that enter the loss
    :return: mean of -log(prediction) over the selected entries
    """
    # np.extract keeps the entries of preds where labels is nonzero.
    return np.mean(-np.log(np.extract(labels, preds)))


def accuracy(preds, labels):
    """Fraction of rows whose arg-max prediction equals the arg-max label.

    :param preds: 2-D array of model outputs
    :param labels: 2-D label array
    :return: mean of the row-wise matches
    """
    return np.mean(np.equal(np.argmax(labels, 1), np.argmax(preds, 1)))


def evaluate_preds(preds, labels, indices):
    """Compute loss and accuracy for several data splits.

    :param preds: predictions for all samples
    :param labels: list of label arrays, one per split
    :param indices: list of index collections, one per split
    :return: (list of cross-entropy losses, list of accuracies), in split
        order
    """
    split_loss = list()
    split_acc = list()

    for y_split, idx_split in zip(labels, indices):
        split_loss.append(
            categorical_crossentropy(preds[idx_split], y_split[idx_split]))
        split_acc.append(accuracy(preds[idx_split], y_split[idx_split]))

    return split_loss, split_acc


def normalized_laplacian(adj, symmetric=True):
    """Return the normalized graph Laplacian L = I - normalize_adj(adj).

    :param adj: scipy sparse adjacency matrix
    :param symmetric: forwarded to :func:`normalize_adj`
    :return: sparse Laplacian matrix
    """
    adj_normalized = normalize_adj(adj, symmetric)
    laplacian = sp.eye(adj.shape[0]) - adj_normalized
    return laplacian


def rescale_laplacian(laplacian):
    """Rescale a Laplacian to L~ = (2 / lambda_max) * L - I.

    :param laplacian: sparse Laplacian matrix
    :return: rescaled sparse matrix; if the eigenvalue solver does not
        converge, lambda_max = 2 is used
    """
    try:
        print('Calculating largest eigenvalue of normalized graph Laplacian...')
        # Largest-magnitude eigenvalue of the Laplacian.
        largest_eigval = eigsh(laplacian, 1, which='LM',
                               return_eigenvectors=False)[0]
    except ArpackNoConvergence:
        print('Eigenvalue calculation did not converge! Using largest_eigval=2 instead.')
        largest_eigval = 2

    scaled_laplacian = (2. / largest_eigval) * laplacian - sp.eye(laplacian.shape[0])
    return scaled_laplacian


def chebyshev_polynomial(X, k):
    """Calculate Chebyshev polynomials up to order k.

    Return a list of sparse matrices.

    :param X: square sparse matrix (the rescaled Laplacian L~)
    :param k: highest polynomial order
    :return: [T0(X), T1(X), ..., Tk(X)]; the list always contains at least
        T0 and T1
    """
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    T_k = list()
    T_k.append(sp.eye(X.shape[0]).tocsr())  # T0(X) = I
    T_k.append(X)  # T1(X) = L~

    # CSR copy of X made once; it is the same for every recurrence step.
    X_ = sp.csr_matrix(X, copy=True)

    def chebyshev_recurrence(T_k_minus_one, T_k_minus_two, X):
        """Return Tk(L~) = 2 * L~ * T(k-1)(L~) - T(k-2)(L~).

        :param T_k_minus_one: T(k-1)(L~)
        :param T_k_minus_two: T(k-2)(L~)
        :param X: L~ (unused here; the hoisted CSR copy is used instead)
        """
        return 2 * X_.dot(T_k_minus_one) - T_k_minus_two

    for i in range(2, k + 1):
        T_k.append(chebyshev_recurrence(T_k[-1], T_k[-2], X))

    return T_k


def sparse_to_tuple(sparse_mx):
    """Convert a sparse matrix to a (coords, values, shape) tuple.

    :param sparse_mx: scipy sparse matrix
    :return: coords is an (nnz, 2) array of (row, col) positions, values the
        stored entries and shape the matrix shape
    """
    if not sp.isspmatrix_coo(sparse_mx):
        # COO format stores rows, columns and values in three arrays.
        sparse_mx = sparse_mx.tocoo()
    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
    values = sparse_mx.data
    shape = sparse_mx.shape
    return coords, values, shape
3 graph.py
# Features that are not yet part of the language standard of the running
# interpreter have to be imported from __future__; this one makes print()
# a function.
from __future__ import print_function

from keras import activations, initializers, constraints
from keras import regularizers
# keras.engine is not a stable public import path for Layer; fall back to
# keras.layers when it is unavailable.
try:
    from keras.engine import Layer
except ImportError:
    from keras.layers import Layer
import keras.backend as K


# A custom Keras layer implements build(), call() and
# compute_output_shape(input_shape).
class GraphConvolution(Layer):
    """Basic graph convolution layer as in https://arxiv.org/abs/1609.02907

    The layer is called on a list ``[features, basis_1, ..., basis_support]``
    and computes ``activation(concat_i(basis_i . features) . kernel + bias)``.
    """

    def __init__(self, units, support=1,
                 activation=None,
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        """Configure the layer.

        :param units: output dimension
        :param support: number of basis matrices passed after the features;
            must be >= 1
        :param activation: activation name or callable (resolved through
            ``activations.get``)
        :param use_bias: whether a bias vector is added to the output
        :param kernel_initializer: initializer for the kernel
        :param bias_initializer: initializer for the bias
        :param kernel_regularizer: regularizer applied to the kernel
        :param bias_regularizer: regularizer applied to the bias
        :param activity_regularizer: regularizer applied to the output
        :param kernel_constraint: constraint applied to the kernel
        :param bias_constraint: constraint applied to the bias
        :param kwargs: forwarded to ``Layer.__init__``; ``input_dim`` is
            translated to ``input_shape`` when the latter is absent
        """
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            # dict.pop removes the key and returns its value.
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(GraphConvolution, self).__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.supports_masking = True

        self.support = support
        assert support >= 1

    def compute_output_shape(self, input_shapes):
        """Return the output shape so Keras can infer downstream shapes.

        :param input_shapes: list of input shapes; the first entry is the
            shape of the feature matrix
        :return: (batch_size, output_dim)
        """
        features_shape = input_shapes[0]
        output_shape = (features_shape[0], self.units)
        return output_shape  # (batch_size, output_dim)

    def build(self, input_shapes):
        """Create the layer weights.

        :param input_shapes: list of input shapes; the first entry is the
            two-dimensional shape of the feature matrix
        """
        features_shape = input_shapes[0]
        assert len(features_shape) == 2
        input_dim = features_shape[1]

        # One (input_dim, units) slice per basis matrix, stacked along the
        # first axis to match the concatenation done in call().
        self.kernel = self.add_weight(shape=(input_dim * self.support,
                                             self.units),
                                      initializer=self.kernel_initializer,
                                      name='kernel',
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        # Mark the layer as built.
        self.built = True

    def call(self, inputs, mask=None):
        """Apply the graph convolution.

        :param inputs: list ``[features, basis_1, ...]``; only the first
            ``self.support`` basis tensors are used
        :param mask: accepted for the Keras masking interface; not used
        :return: activation(concat_i(basis_i . features) . kernel [+ bias])
        """
        features = inputs[0]
        basis = inputs[1:]

        # basis_i . features for every basis matrix.
        supports = [K.dot(basis[i], features) for i in range(self.support)]
        # Concatenate the per-basis results along axis 1.
        supports = K.concatenate(supports, axis=1)
        output = K.dot(supports, self.kernel)

        # Test the configuration flag rather than the truthiness of the bias
        # tensor itself.
        if self.use_bias:
            output += self.bias
        return self.activation(output)

    def get_config(self):
        """Return the layer configuration merged with the base-class config."""
        config = {'units': self.units,
                  'support': self.support,
                  'activation': activations.serialize(self.activation),
                  'use_bias': self.use_bias,
                  'kernel_initializer': initializers.serialize(
                      self.kernel_initializer),
                  'bias_initializer': initializers.serialize(
                      self.bias_initializer),
                  'kernel_regularizer': regularizers.serialize(
                      self.kernel_regularizer),
                  'bias_regularizer': regularizers.serialize(
                      self.bias_regularizer),
                  'activity_regularizer': regularizers.serialize(
                      self.activity_regularizer),
                  'kernel_constraint': constraints.serialize(
                      self.kernel_constraint),
                  'bias_constraint': constraints.serialize(self.bias_constraint)
                  }

        base_config = super(GraphConvolution, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
4 train.py
# Training script: fits a two-layer graph convolutional network and reports
# train / validation scores per epoch and test scores at the end.
from __future__ import print_function

from keras.layers import Input, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

from kegra.layers.graph import GraphConvolution
# Explicit names instead of a star import; these are all the helpers the
# script uses.
from kegra.utils import (load_data, get_splits, preprocess_adj,
                         normalized_laplacian, rescale_laplacian,
                         chebyshev_polynomial, evaluate_preds)

import time

# Define parameters
DATASET = 'cora'
FILTER = 'localpool'  # 'chebyshev'
MAX_DEGREE = 2  # maximum polynomial degree
SYM_NORM = True  # symmetric (True) vs. left-only (False) normalization
NB_EPOCH = 20000  # maximum number of training epochs
PATIENCE = 10  # early stopping patience

# Get data: features, adjacency matrix, one-hot labels
X, A, y = load_data(dataset=DATASET)
y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask = get_splits(y)

# Normalize X so that every row sums to one
X /= X.sum(1).reshape(-1, 1)

if FILTER == 'localpool':
    # Local pooling filters
    # (see 'renormalization trick' in Kipf & Welling, arXiv 2016)
    print('Using local pooling filters...')
    # Normalized adjacency matrix with self-connections added
    A_ = preprocess_adj(A, SYM_NORM)
    support = 1
    graph = [X, A_]
    G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True)]

elif FILTER == 'chebyshev':
    # Chebyshev polynomial basis filters (Defferard et al., NIPS 2016)
    print('Using Chebyshev polynomial basis filters...')
    L = normalized_laplacian(A, SYM_NORM)
    L_scaled = rescale_laplacian(L)
    T_k = chebyshev_polynomial(L_scaled, MAX_DEGREE)
    # One basis matrix per polynomial order 0..MAX_DEGREE. This assignment
    # must be live code: `support` is read on the lines below.
    support = MAX_DEGREE + 1
    graph = [X] + T_k
    G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True)
         for _ in range(support)]

else:
    raise Exception('Invalid filter type.')

# `shape` excludes the batch dimension
X_in = Input(shape=(X.shape[1],))

# Define model architecture
# NOTE: We pass arguments for graph convolutional layers as a list of tensors.
# This is somewhat hacky, more elegant options would require rewriting the
# Layer base class.
H = Dropout(0.5)(X_in)
H = GraphConvolution(16, support, activation='relu',
                     kernel_regularizer=l2(5e-4))([H] + G)
H = Dropout(0.5)(H)
Y = GraphConvolution(y.shape[1], support, activation='softmax')([H] + G)

# Compile model
model = Model(inputs=[X_in] + G, outputs=Y)
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01))

# Helper variables for main training loop
wait = 0
preds = None
best_val_loss = 99999

# Fit
for epoch in range(1, NB_EPOCH + 1):

    # Log wall-clock time
    t = time.time()

    # Single training iteration (we mask nodes without labels for loss
    # calculation by passing train_mask as sample_weight)
    model.fit(graph, y_train, sample_weight=train_mask,
              batch_size=A.shape[0], epochs=1, shuffle=False, verbose=0)

    # Predict on full dataset
    preds = model.predict(graph, batch_size=A.shape[0])

    # Train / validation scores
    train_val_loss, train_val_acc = evaluate_preds(preds, [y_train, y_val],
                                                   [idx_train, idx_val])
    print("Epoch: {:04d}".format(epoch),
          "train_loss= {:.4f}".format(train_val_loss[0]),
          "train_acc= {:.4f}".format(train_val_acc[0]),
          "val_loss= {:.4f}".format(train_val_loss[1]),
          "val_acc= {:.4f}".format(train_val_acc[1]),
          "time= {:.4f}".format(time.time() - t))

    # Early stopping: stop once the validation loss has failed to improve
    # while `wait` has already reached PATIENCE
    if train_val_loss[1] < best_val_loss:
        best_val_loss = train_val_loss[1]
        wait = 0
    else:
        if wait >= PATIENCE:
            print('Epoch {}: early stopping'.format(epoch))
            break
        wait += 1

# Testing: scores on the test split, using the predictions of the last epoch
test_loss, test_acc = evaluate_preds(preds, [y_test], [idx_test])
print("Test set results:",
      "loss= {:.4f}".format(test_loss[0]),
      "accuracy= {:.4f}".format(test_acc[0]))
原文:https://blog.csdn.net/tszupup/article/details/89004637