Graph Attention Networks (GAT) 代码解读
Graph Attention Networks (GAT) 代码解读
1.1 代码结构
.
|--- data # Cora数据集
|--- models # GAT模型定义(gat.py)
|--- pre_trained # 预训练的模型
|--- utils # 工具定义
1.2 参数设置
GAT/execute_cora.py
# --- optimisation settings ---
nb_epochs = 100000
patience = 100  # presumably the early-stopping patience; confirm against the training loop
batch_size = 1
lr = 0.005  # learning rate
l2_coef = 0.0005  # weight decay

# --- architecture settings ---
hid_units = [8]  # numbers of hidden units per each attention head in each layer
n_heads = [8, 1]  # one entry per hidden layer plus an additional entry for the output layer
residual = False
nonlinearity = tf.nn.elu
model = GAT
1.3 导入数据
GAT源码默认使用的是Cora数据集。Cora的相关代码介绍可以参考这里
数据预处理部分和GCN源码相同,可以参考这里
最终载入的数据中,adj为邻接矩阵,表示2708篇文章之间的引用关系;features为特征矩阵,表示1433个单词在2708篇文章中是否出现。
GAT/utils/process.py
def load_data(dataset_str):
    # ... (the loading code is elided in this excerpt; `adj` and `features`
    # are produced by the omitted part)
    print(adj.shape)  # (2708, 2708)
    print(features.shape)  # (2708, 1433)
1.4 特征预处理
GAT/utils/process.py
def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation.

    Every row of ``features`` is scaled by the reciprocal of its row sum.
    Rows whose sum is zero get a scale of 0 and therefore stay all-zero.

    Args:
        features: Feature matrix supporting ``.sum(1)``; the normalised
            product must support ``.todense()`` (presumably a scipy.sparse
            matrix - confirm against the caller).

    Returns:
        A pair ``(dense, tuple_repr)`` where ``dense`` is
        ``features.todense()`` of the normalised matrix and ``tuple_repr`` is
        whatever ``sparse_to_tuple`` returns for it.
    """
    rowsum = np.array(features.sum(1))
    # NumPy refuses to raise integer arrays to a negative integer power, so
    # promote non-floating row sums (e.g. raw word counts) before inverting.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # Empty rows give 1/0 = inf; that is expected and reset to 0 just below,
    # so silence the divide-by-zero warning instead of letting it leak out.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    features = r_mat_inv.dot(features)
    return features.todense(), sparse_to_tuple(features)
1.5 模型定义-前向传播
添加一层单头注意力
'''
输入是(B,N,D),B是batch size,N是节点数,D是每个节点的原始特征维数
输出是(B,N,F),F是每个节点的新特征维数
每个节点从维度D到维度F是按注意力为权重聚合了邻居节点的特征
'''
def att_head(seq, out_sz, bias_mat, activation, in_drop=0.0, coef_drop=0.0, residual=False):
    """Apply one graph-attention head to a batch of node features.

    Each node's new feature vector is the attention-weighted sum of the
    linearly projected features of the nodes it is allowed to attend to.

    Args:
        seq: Input node features, documented by the author as (B, N, D) with
            B the batch size, N the number of nodes and D the input feature
            size.
        out_sz: Output feature size F per node.
        bias_mat: (N, N) mask matrix that is added to the attention logits
            before the softmax.
        activation: Callable applied to the final result.
        in_drop: Dropout rate for the inputs; 0.0 disables the dropout op.
        coef_drop: Dropout rate for the attention coefficients; 0.0 disables
            the dropout op.
        residual: Whether to add a residual connection from ``seq``.

    Returns:
        ``activation`` applied to the aggregated features, (B, N, F).
    """
    with tf.name_scope('my_attn'):
        # Dropout on the raw input; skipped entirely when the rate is 0.
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        # Learnable linear projection W applied to every node: seq_fts = W h.
        # A kernel-size-1 convolution is a per-node dense layer.
        # (B, N, D) => (B, N, F)
        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # Two separate scoring maps: f_1 = a1(W h_i), f_2 = a2(W h_j).
        # (B, N, F) => (B, N, 1)
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        # (B, N, F) => (B, N, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)

        # (B, N, 1) + (B, 1, N) broadcasts to (B, N, N):
        # logits[i, j] = e_ij = a1(W h_i) + a2(W h_j).
        # The matrix is not symmetric in general because f_1 and f_2 use
        # separate parameters.
        logits = f_1 + tf.transpose(f_2, [0, 2, 1])

        # bias_mat is added before the softmax and acts as a mask, so that
        # only permitted (neighbouring) pairs contribute; this keeps the
        # attention local to the graph structure.
        # (B, N, N) + (1, N, N) => (B, N, N) via broadcasting.
        coefs = tf.nn.softmax(tf.nn.leaky_relu(logits) + bias_mat)

        # Dropout on the attention coefficients and on the projected features.
        if coef_drop != 0.0:
            coefs = tf.nn.dropout(coefs, 1.0 - coef_drop)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # h_i' = sum_j a_ij * W h_j: aggregate features by attention.
        # (B, N, N) x (B, N, F) => (B, N, F)
        vals = tf.matmul(coefs, seq_fts)
        # Add a bias term.
        ret = tf.contrib.layers.bias_add(vals)

        # Residual connection: add the input directly when D == F, otherwise
        # project it to F channels first.
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                ret = ret + tf.layers.conv1d(seq, ret.shape[-1], 1)
            else:
                ret = ret + seq

        return activation(ret)


# Alias so that callers using the `attn_head` spelling resolve to this function.
attn_head = att_head
模型定义
class BaseGAttN:
    """Loss, optimiser and metric helpers shared by the attention models.

    None of the helpers use instance state, so they are static methods and
    can be called on the class or on an instance.
    """

    @staticmethod
    def loss(logits, labels, nb_classes, class_weights):
        """Return the class-weighted mean sparse softmax cross-entropy.

        Args:
            logits: Model outputs.
            labels: Integer class ids (they are one-hot encoded here to look
                up each sample's class weight).
            nb_classes: Number of classes, used as the one-hot depth.
            class_weights: Per-class weights, multiplied with the one-hot
                labels.

        Returns:
            Scalar tensor named ``xentropy_mean``.
        """
        # Pick each sample's weight out of class_weights via its one-hot label.
        sample_wts = tf.reduce_sum(
            tf.multiply(tf.one_hot(labels, nb_classes), class_weights), axis=-1)
        xentropy = tf.multiply(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits),
            sample_wts)
        return tf.reduce_mean(xentropy, name='xentropy_mean')

    @staticmethod
    def training(loss, lr, l2_coef):
        """Build the Adam training op for ``loss`` plus an L2 penalty.

        Args:
            loss: Scalar loss tensor to minimise.
            lr: Learning rate for the Adam optimiser.
            l2_coef: Coefficient that scales the summed L2 penalty.

        Returns:
            The op returned by ``AdamOptimizer.minimize``.
        """
        # weight decay
        train_vars = tf.trainable_variables()
        # The coefficient scales the *sum* of the penalties; it must stay
        # outside add_n (multiplying the Python list by a float is a TypeError).
        # NOTE(review): v.name is usually a scoped name such as 'dense/bias:0',
        # so this exact-match filter may never exclude anything - confirm
        # before relying on biases being exempt from weight decay.
        lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in train_vars
                           if v.name not in ['bias', 'gamma', 'b', 'g', 'beta']]) * l2_coef
        # optimizer
        opt = tf.train.AdamOptimizer(learning_rate=lr)
        # training op
        train_op = opt.minimize(loss + lossL2)
        return train_op

    @staticmethod
    def masked_softmax_cross_entropy(logits, labels, mask):
        """Softmax cross-entropy loss with masking.

        Args:
            logits: Model outputs, (B, C); B samples, C output dimensions.
            labels: Targets, (B, C).
            mask: Mask, (B,); samples with mask 0 do not contribute.

        Returns:
            Scalar mean of the masked per-sample losses.
        """
        # Per-sample loss, shape (B,).
        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        mask = tf.cast(mask, dtype=tf.float32)
        # Normalise the mask by its mean so that the final reduce_mean over
        # all B samples equals the mean over the unmasked samples only.
        mask /= tf.reduce_mean(mask)
        loss *= mask
        return tf.reduce_mean(loss)

    @staticmethod
    def masked_sigmoid_cross_entropy(logits, labels, mask):
        """Sigmoid cross-entropy loss with masking.

        Args:
            logits: Model outputs, (B, C); B samples, C output dimensions.
            labels: Targets, (B, C).
            mask: Mask, (B,); samples with mask 0 do not contribute.

        Returns:
            Scalar mean of the masked per-sample losses.
        """
        # Cast the labels themselves (not the mask) to float for the loss op.
        labels = tf.cast(labels, dtype=tf.float32)
        # Element-wise loss, shape (B, C).
        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)
        # (B, C) => (B,)
        loss = tf.reduce_mean(loss, axis=1)
        # The mask must be float before it can be normalised and multiplied in.
        mask = tf.cast(mask, dtype=tf.float32)
        mask /= tf.reduce_mean(mask)
        loss *= mask
        return tf.reduce_mean(loss)

    @staticmethod
    def masked_accuracy(logits, labels, mask):
        """Accuracy with masking.

        Args:
            logits: Model outputs, (B, C); B samples, C output dimensions.
            labels: Targets, (B, C).
            mask: Mask, (B,); samples with mask 0 do not contribute.

        Returns:
            Scalar mean of the masked per-sample correctness indicators.
        """
        # A prediction is correct when the arg-max index of the logits equals
        # the arg-max index of the labels.
        correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
        accuracy_all = tf.cast(correct_prediction, tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        mask /= tf.reduce_mean(mask)
        accuracy_all *= mask
        return tf.reduce_mean(accuracy_all)
#%%
class GAT(BaseGAttN):
    """Multi-layer graph attention network assembled from ``att_head`` heads."""

    @staticmethod
    def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop, bias_mat,
                  hid_mat, hid_units, n_heads, activation=tf.nn.elu, residual=False):
        """Build the forward pass and return the class logits.

        Args:
            inputs: Node features, documented by the author as (B, N, D) with
                B the batch size, N the number of nodes and D the input
                feature size.
            nb_classes: Number of target classes C.
            nb_nodes: Number of nodes N (not used in this function).
            training: Training/testing flag (not used in this function).
            attn_drop: Dropout rate for the attention coefficients.
            ffd_drop: Dropout rate for the layer inputs.
            bias_mat: (N, N) attention mask matrix passed to every head.
            hid_mat: Not used in this function; kept so the signature is
                unchanged for existing callers.
            hid_units: List; entry i is the number of hidden units per
                attention head in layer i.
            n_heads: List; entry i is the number of heads in layer i, and the
                last entry is the number of output heads.
            activation: Activation used by the hidden layers.
            residual: Whether hidden layers after the first use residual
                connections.

        Returns:
            Logits of shape (B, N, C), the average over the output heads.
        """
        # First layer: H1 = n_heads[0] heads, each (B, N, D) => (B, N, F1),
        # concatenated to (B, N, F1 * H1). No residual on the first layer.
        attns = [
            att_head(inputs, bias_mat=bias_mat,
                     out_sz=hid_units[0], activation=activation,
                     in_drop=ffd_drop, coef_drop=attn_drop, residual=False)
            for _ in range(n_heads[0])
        ]
        h_1 = tf.concat(attns, axis=-1)

        # Remaining hidden layers (len(hid_units) - 1 of them): layer i runs
        # Hi = n_heads[i] heads on the previous output, each producing
        # (B, N, Fi), concatenated to (B, N, Fi * Hi).
        for i in range(1, len(hid_units)):
            attns = [
                att_head(h_1, bias_mat=bias_mat,
                         out_sz=hid_units[i], activation=activation,
                         in_drop=ffd_drop, coef_drop=attn_drop, residual=residual)
                for _ in range(n_heads[i])
            ]
            h_1 = tf.concat(attns, axis=-1)

        # Output layer: n_heads[-1] heads (usually 1), each mapping the last
        # hidden representation to (B, N, C) with an identity activation.
        out = [
            att_head(h_1, bias_mat=bias_mat,
                     out_sz=nb_classes, activation=lambda x: x,
                     in_drop=ffd_drop, coef_drop=attn_drop, residual=False)
            for _ in range(n_heads[-1])
        ]
        # Average (rather than concatenate) the output heads.
        logits = tf.add_n(out) / n_heads[-1]
        return logits
---- suffer now and live the rest of your life as a champion ----