Graph Attention Networks (GAT) 代码解读

Graph Attention Networks (GAT) 代码解读

1.1 代码结构

.
|--- data 		    # Cora数据集
|--- models		    # GAT模型定义(gat.py)
|--- pre_trained	# 预训练的模型
|--- utils		    # 工具定义

1.2 参数设置

GAT/execute_cora.py

# training params
# Hyper-parameters for the Cora run. The training loop that consumes them is
# not part of this excerpt, so the notes below describe only what the values
# and names show.
batch_size = 1
nb_epochs = 100000
# NOTE(review): presumably the early-stopping patience (epochs without
# improvement) - confirm against the training loop.
patience = 100
lr = 0.005  # learning rate
l2_coef = 0.0005  # weight decay
hid_units = [8] # numbers of hidden units per each attention head in each layer
n_heads = [8, 1] # additional entry for the output layer
# Whether residual connections are requested from the model.
residual = False
# Activation function handed to the model.
nonlinearity = tf.nn.elu
# Model class to build; `GAT` is defined elsewhere.
model = GAT

1.3 导入数据

GAT源码默认使用的Cora数据集。Cora的相关代码介绍可以参考这里

数据预处理部分和GCN源码相同,可以参考这里

最终载入的数据adj为邻接矩阵,表示2708篇文章之间的引用关系。features表示1433个单词在2708篇文章中是否存在。

GAT/utils/process.py

def load_data(dataset_str):
    """Load the dataset identified by ``dataset_str`` (loading code elided).

    NOTE(review): this excerpt omits the real body; ``adj`` and ``features``
    are produced by the omitted code. Only the two shape checks are shown.
    """
    # ... (dataset loading omitted in this excerpt)
    # Adjacency matrix; the shape noted for Cora is nodes x nodes.
    print(adj.shape) # (2708, 2708)
    # Feature matrix; the shape noted for Cora is nodes x features.
    print(features.shape) #(2708, 1433)

1.4 特征预处理

GAT/utils/process.py

def preprocess_features(features):
    """Row-normalize a feature matrix and return dense and tuple forms.

    Every row is scaled by the reciprocal of its row sum; rows whose sum is
    zero are left as all zeros instead of becoming inf/nan.

    Args:
        features: feature matrix. It is expected to be a SciPy sparse matrix,
            because the normalized result is converted with ``.todense()``.

    Returns:
        A pair ``(dense, tuple_repr)``: the normalized matrix from
        ``.todense()`` and the result of ``sparse_to_tuple`` on the same
        normalized matrix.
    """
    rowsum = np.asarray(features.sum(1)).flatten()
    # Integer sums cannot be raised to a negative power (NumPy raises
    # ValueError), so promote anything that is not already float/complex.
    if not np.issubdtype(rowsum.dtype, np.inexact):
        rowsum = rowsum.astype(np.float64)
    # Zero rows are expected; their reciprocal is inf and is reset to 0 just
    # below, so the divide-by-zero warning would only be noise.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)

    # Left-multiplying by diag(1 / rowsum) scales each row.
    features = r_mat_inv.dot(features)
    return features.todense(), sparse_to_tuple(features)

1.5 模型定义-前向传播

添加一层单头注意力

'''
输入是(B,N,D),B是batch size,N是节点数,D是每个节点的原始特征维数
输出是(B,N,F),F是每个节点的新特征维数
每个节点从维度D到维度F是按注意力为权重聚合了邻居节点的特征
'''

def att_head(seq, out_sz, bias_mat, activation, in_drop = 0.0, coef_drop = 0.0, residual = False):
    """Apply a single graph-attention head to a batch of node features.

    Each node's output is an attention-weighted sum of linearly projected
    node features, followed by a bias, an optional residual connection and
    the activation.

    Args:
        seq: input node features, documented by the author as (B, N, D):
            batch size, number of nodes, input feature size.
        out_sz: output feature size F per node.
        bias_mat: mask added to the attention logits before the softmax.
            NOTE(review): presumably holds large negative values for
            non-neighbour pairs so that they vanish after the softmax; its
            construction is not shown here.
        activation: callable applied to the final result.
        in_drop: dropout rate for the input and the projected features;
            0.0 disables it.
        coef_drop: dropout rate for the attention coefficients; 0.0
            disables it.
        residual: if True, add the (possibly projected) input to the output.

    Returns:
        ``activation`` applied to the aggregated features, (B, N, F) under
        the documented input shape.
    """
    with tf.name_scope('my_attn'):
        # TF1-style dropout: the second argument is the keep probability.
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        # Learnable linear projection W h of every node: a kernel-size-1
        # convolution is a per-node dense layer, (B, N, D) -> (B, N, F).
        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # Two separate scalar scores per node, each (B, N, F) -> (B, N, 1):
        # f_1[i] = a1(W h_i), f_2[j] = a2(W h_j).
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)

        # (B, N, 1) + (B, 1, N) broadcasts to (B, N, N):
        # logits[i, j] = e_ij = a1(W h_i) + a2(W h_j).
        # The result is generally not symmetric because f_1 and f_2 use
        # separate parameters.
        logits = f_1 + tf.transpose(f_2, [0, 2, 1])

        # Attention coefficients a_ij: LeakyReLU, add the mask, then softmax
        # over the last axis.
        coefs = tf.nn.softmax(tf.nn.leaky_relu(logits) + bias_mat)

        # Dropout on the attention coefficients and on the projected features.
        if coef_drop != 0.0:
            coefs = tf.nn.dropout(coefs, 1.0 - coef_drop)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # h_i' = sum_j a_ij * W h_j : (B, N, N) x (B, N, F) -> (B, N, F).
        vals = tf.matmul(coefs, seq_fts)

        # Learnable bias term.
        ret = tf.contrib.layers.bias_add(vals)

        # Residual connection, applied before the activation.
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                # D != F: project the input to F channels so it can be added.
                ret = ret + tf.layers.conv1d(seq, ret.shape[-1], 1)
            else:
                ret = ret + seq

        return activation(ret)

模型定义

class BaseGAttN:
    """Loss, optimizer and metric helpers shared by the attention models.

    None of the helpers use instance state, so they are static methods and
    can be called on the class or on an instance.
    """

    @staticmethod
    def loss(logits, labels, nb_classes, class_weights):
        """Return the class-weighted mean sparse softmax cross-entropy.

        Args:
            logits: model outputs.
            labels: integer class ids (they are one-hot encoded here and
                passed to the sparse cross-entropy op).
            nb_classes: number of classes, used as the one-hot depth.
            class_weights: per-class weights multiplied with the one-hot
                labels to obtain one weight per sample.
        """
        # Select each sample's weight according to its class.
        sample_wts = tf.reduce_sum(tf.multiply(tf.one_hot(labels, nb_classes), class_weights), axis=-1)
        xentropy = tf.multiply(tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits), sample_wts)
        return tf.reduce_mean(xentropy, name='xentropy_mean')

    @staticmethod
    def training(loss, lr, l2_coef):
        """Build the Adam training op for ``loss`` plus an L2 penalty.

        Args:
            loss: scalar loss tensor to minimize.
            lr: learning rate for Adam.
            l2_coef: multiplier for the summed L2 loss of the variables.

        Returns:
            The op that minimizes ``loss + l2_coef * sum(l2_loss(v))``.
        """
        # weight decay
        trainable = tf.trainable_variables()
        # NOTE(review): the filter compares the full variable name for exact
        # equality; if names carry scope prefixes or ':0' suffixes it will
        # exclude nothing - confirm whether biases should be regularized.
        # The coefficient scales the summed penalty, so it is applied to the
        # result of add_n rather than to the Python list.
        lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in trainable
                           if v.name not in ['bias', 'gamma', 'b', 'g', 'beta']]) * l2_coef

        # optimizer
        opt = tf.train.AdamOptimizer(learning_rate = lr)

        # training op
        train_op = opt.minimize(loss + lossL2)

        return train_op

    @staticmethod
    def masked_softmax_cross_entropy(logits, labels, mask):
        """Softmax cross-entropy loss with masking.

        Args:
            logits: model outputs, documented as (B, C): samples x classes.
            labels: target distribution, documented as (B, C).
            mask: per-sample mask, documented as (B,).

        Returns:
            Mean of the per-sample losses after weighting by the normalized
            mask.
        """
        # Per-sample loss: softmax over the logits, then cross-entropy
        # against the labels.
        loss = tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = labels)

        mask = tf.cast(mask, dtype = tf.float32)

        # Divide by the mask mean so that the final mean over all samples
        # equals the mean over the selected samples only.
        mask /= tf.reduce_mean(mask)

        # Zero out the loss of masked-out samples.
        loss *= mask

        return tf.reduce_mean(loss)

    @staticmethod
    def masked_sigmoid_cross_entropy(logits, labels, mask):
        """Sigmoid cross-entropy loss with masking.

        Args:
            logits: model outputs, documented as (B, C): samples x classes.
            labels: targets, documented as (B, C).
            mask: per-sample mask, documented as (B,).

        Returns:
            Mean of the per-sample losses after weighting by the normalized
            mask.
        """
        labels = tf.cast(labels, dtype = tf.float32)
        # Element-wise loss with the same shape as the logits.
        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = logits, labels = labels)
        # Average over axis 1 to get one loss value per sample.
        loss = tf.reduce_mean(loss, axis = 1)

        mask = tf.cast(mask, dtype = tf.float32)
        # Same normalization as in masked_softmax_cross_entropy.
        mask /= tf.reduce_mean(mask)
        loss *= mask

        return tf.reduce_mean(loss)

    @staticmethod
    def masked_accuracy(logits, labels, mask):
        """Accuracy with masking.

        Args:
            logits: model outputs, documented as (B, C): samples x classes.
            labels: targets, documented as (B, C).
            mask: per-sample mask, documented as (B,).

        Returns:
            Mean accuracy after weighting each sample by the normalized mask.
        """
        # A prediction is correct when the arg-max indices agree.
        correct_prediction = tf.equal( tf.argmax(logits, 1), tf.argmax(labels, 1) )
        accuracy_all = tf.cast( correct_prediction, tf.float32 )
        mask = tf.cast( mask, dtype = tf.float32 )
        mask /= tf.reduce_mean(mask)
        accuracy_all *= mask
        return tf.reduce_mean(accuracy_all)
    
    
#%%
class GAT(BaseGAttN):
    """Graph attention network built from stacked multi-head attention layers."""

    @staticmethod
    def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop, bias_mat,
                  hid_mat, hid_units, n_heads, activation = tf.nn.elu, residual = False):
        """Build the forward pass and return per-node class logits.

        Args:
            inputs: node features, documented as (B, N, D): batch size,
                number of nodes, input feature size.
            nb_classes: number of output classes C.
            nb_nodes: number of nodes N. Not used in this function.
            training: train/test flag. Not used in this function.
            attn_drop: dropout rate for the attention coefficients.
            ffd_drop: dropout rate for the layer inputs.
            bias_mat: attention mask, documented as an (N, N) matrix derived
                from the adjacency matrix.
            hid_mat: not used in this function; kept so the signature stays
                unchanged. NOTE(review): confirm whether it can be dropped.
            hid_units: list; entry i is the number of hidden units per
                attention head in layer i.
            n_heads: list; entry i is the number of attention heads in
                layer i, and the last entry belongs to the output layer.
            activation: activation used by the hidden layers.
            residual: whether hidden layers after the first use residual
                connections.

        Returns:
            The averaged output of the last layer's heads, (B, N, C) under
            the documented input shape.
        """
        # First layer: n_heads[0] heads applied to the raw inputs, each
        # producing hid_units[0] features. It never uses a residual
        # connection.
        attns = [
            att_head(inputs, bias_mat = bias_mat,
                     out_sz = hid_units[0], activation = activation,
                     in_drop = ffd_drop, coef_drop = attn_drop, residual = False)
            for _ in range(n_heads[0])
        ]
        # Concatenate the heads along the feature axis.
        h_1 = tf.concat(attns, axis = -1)

        # Remaining hidden layers (len(hid_units) - 1 of them): each consumes
        # the concatenated output of the previous layer.
        for i in range(1, len(hid_units)):
            attns = [
                att_head(h_1, bias_mat = bias_mat,
                         out_sz = hid_units[i], activation = activation,
                         in_drop = ffd_drop, coef_drop = attn_drop, residual = residual)
                for _ in range(n_heads[i])
            ]
            h_1 = tf.concat(attns, axis = -1)

        # Output layer: n_heads[-1] heads, each mapping to nb_classes with an
        # identity activation so that raw logits are returned.
        out = [
            att_head(h_1, bias_mat = bias_mat,
                     out_sz = nb_classes, activation = lambda x : x,
                     in_drop = ffd_drop, coef_drop = attn_drop, residual = False)
            for _ in range(n_heads[-1])
        ]

        # Output heads are averaged rather than concatenated.
        logits = tf.add_n(out) / n_heads[-1]

        return logits
    
posted @ 2021-03-17 21:15  popozyl  阅读(4353)  评论(0编辑  收藏  举报