Loading

7-4DeepFM模型

推荐系统和广告CTR预估主流模型的演化有两条主要路线。

第一条是显式建模特征交互,提升模型对交叉特征的捕获能力。(如Wide&Deep,PNN,FNN,DCN,DeepFM,AutoInt等)

第二条是加入注意力机制,提升模型的自适应能力和解释性。(如DIN,DIEN,DSIN,FiBiNET,AutoInt等)

在所有这些模型中,DeepFM属于性价比非常高的模型(结构简洁,计算高效,指标有竞争力)。

张俊林大佬 在2019年的时候甚至建议 沿着 LR->FM->DeepFM->干点别的 这样的路线去迭代推荐系统。

参考文档:

import torch
import torchkeras
print("torch.__version__ = ", torch.__version__)
print("torchkeras.__version__ = ", torchkeras.__version__) 

"""
torch.__version__ =  2.3.1+cu121
torchkeras.__version__ =  3.9.6
"""

1.DeepFM原理解析

DeepFM继承了Wide&Deep的主体结构,将高低特征进行融合。

其主要创新点有2个。

一是将Wide部分替换成了 FM结构,以更有效的捕获特征交互interaction.

二是FM中的隐向量 和 Deep部分的 embedding 向量共享权重,减少模型复杂性。

2.DeepFM的pytorch实现

下面是一个DeepFM的Pytorch实现。

除了添加了一个并行的MLP模块用于捕获隐式高阶交叉和组合特征外,其余结构基本和FM的实现完全一致。

import torch
from torch import nn, Tensor
import torch.nn.functional as F


class NumEmbedding(nn.Module):
    """
    连续特征用linear层编码
    输入shape: [batch_size, feature_number(n), d_in],  # d_in通常是1
    输出shape: [batch_size, feature_number(n), d_out]
    """
    def __init__(self, n: int, d_in: int, d_out: int, bias: bool = False) -> None:
        super().__init__()
        self.weight = nn.Parameter(Tensor(n, d_in, d_out))
        self.bias = nn.Parameter(Tensor(n, d_out)) if bias else None
        with torch.no_grad():
            for i in range(n):
                layer = nn.Linear(d_in, d_out)
                self.weight[i] = layer.weight.T
                if self.bias is not None:
                    self.bias[i] = layer.bias

    def forward(self, x_num):
        # x_num: batch_size, features_num, d_in
        assert x_num.ndim == 3
        # x = x_num[..., None] * self.weight[None]
        # x = x.sum(-2)
        x = torch.einsum("bfi,fij->bfj", x_num, self.weight)
        if self.bias is not None:
            x = x + self.bias[None]
        return x


class CatEmbedding(nn.Module):
    """
    离散特征用Embedding层编码。
    输入shape: [batch_size, feature_num],
    输出shape: [batch_size, feature_num, d_embed]
    """
    def __init__(self, categories, d_embed):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(categories), d_embed)
        # 这段代码的作用是创建一个不可训练的参数(nn.Parameter)self.offsets,并初始化为一个张量,这个张量用于存储类别的偏移量(offsets)。
        self.offsets = nn.Parameter(torch.tensor([0] + categories[:-1]).cumsum(0), requires_grad=False)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x_cat):
        """
        :param x_cat: Long tensor of (batch_size, feature_num)
        """
        # 通过将类别索引张量 x_cat 与 self.offsets 相加,调整类别索引,使其正确对应到 self.embedding 中的实际类别
        x = x_cat + self.offsets[None]
        return self.embedding(x)


class CatLinear(nn.Module):
    """
    离散特征用Embedding实现线性层(等价于先F.onehot再nn.Linear())
    输入shape: [batch_size, features_num]
    输出shape: [batch_size, features_num, d_out]
    """
    def __init__(self, categories, d_out=1):
        super().__init__()
        self.fc = nn.Embedding(sum(categories), d_out)
        self.bias = nn.Parameter(torch.zeros((d_out, )))
        self.offsets = nn.Parameter(torch.tensor([0]+categories[:-1]).cumsum(0), requires_grad=False)

    def forward(self, x_cat):
        """
        :param x: Long tensor of size (batch_size, features_num)
        """
        x = x_cat + self.offsets[None]
        return torch.sum(self.fc(x), dim=1) + self.bias


class FMLayer(nn.Module):
    """
    FM交互项
    """
    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):  # 注意 这里的x是公式中的<v_i> * xi
        """
        :param x: Float tensor of size (batch_size, num_features, k)
        """
        square_of_sum = torch.sum(x, dim=1) ** 2
        sum_of_square = torch.sum(x ** 2, dim=1)
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim=1, keepdim=True)
        return 0.5 * ix


# deep部分
class MultiLayerPerceptron(nn.Module):
    def __init__(self, d_in, d_layers, dropout, d_out=1):
        super().__init__()
        layers = []
        for d in d_layers:
            layers.append(nn.Linear(d_in, d))
            layers.append(nn.BatchNorm1d(d))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(p=dropout))
            d_in = d
        layers.append(nn.Linear(d_layers[-1], d_out))
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: Float tensor of size (batch_size, d_in)
        """
        return self.mlp(x)


class DeepFM(nn.Module):
    """
    DeepFM模型
    """
    def __init__(self, d_numerical, categories, d_embed, deep_layers, deep_dropout, n_classes=1):
        super().__init__()
        if d_numerical is None:
            d_numerical = 0
        if categories is None:
            categories = []

        self.categories = categories
        self.n_classes = n_classes
        self.num_linear = nn.Linear(d_numerical, n_classes) if d_numerical else None
        self.cat_linear = CatLinear(categories, n_classes) if categories else None
        self.num_embedding = NumEmbedding(d_numerical, 1, d_embed) if d_numerical else None
        self.cat_embedding = CatEmbedding(categories, d_embed) if categories else None

        """
        FM 的主要作用是捕捉特征之间的交互效应。在二分类问题中,输出一个单一的值(例如,用于二元交叉熵损失)足以描述模型的预测。
        当 reduce_sum=True 时,FM 层会将所有特征的交互结果相加,从而得到一个标量输出。这适合二分类,因为我们只需要一个输出值来进行分类。
        输出的标量可以直接用于后续的激活函数(如 Sigmoid),从而将其转换为概率值。
        在二分类的场景下,引入一个额外的线性层(self.fm_linear)可能会增加模型的复杂性,但对最终输出没有实质性帮助。因此,省略这个层能够简化模型结构。
        单输出:对于二分类,通常最后只需要一个输出(例如,预测为正类的概率),因此 self.fm 的输出直接作为最终预测的输入。
        多输出:在多分类的情况下(n_classes >= 2),需要多个输出(每个类一个),因此需要一个线性层(self.fm_linear)来生成每个类的预测。
        """
        if n_classes == 1:
            self.fm = FMLayer(reduce_sum=True)
            self.fm_linear = None
        else:
            assert n_classes >= 2
            self.fm = FMLayer(reduce_sum=False)
            self.fm_linear = nn.Linear(d_embed, n_classes)

        # 包含数值特征的嵌入和分类特征的嵌入的总和。
        # (数值特征的数量×每个数值特征的嵌入维度)+(类别特征的数量×每个类别特征的嵌入维度)
        self.deep_in = d_numerical * d_embed + len(categories) * d_embed
        self.deep = MultiLayerPerceptron(
            d_in=self.deep_in,
            d_layers=deep_layers,
            dropout=deep_dropout,
            d_out=n_classes
        )

    def forward(self, x):
        """
        :param x_num: numerical features
        :param x_cat: categorical features
        """
        x_num, x_cat = x

        # linear部分
        x = 0.0
        if self.num_linear:
            x = x + self.num_linear(x_num)
        if self.cat_linear:
            x = x + self.cat_linear(x_cat)

        # fm部分
        x_embedding = []
        if self.num_embedding:
            # [..., None]: 这个操作用于增加一个维度,使 x_num 的形状变为 (batch_size, d_numerical, 1)。
            # 在 PyTorch 中,这种方式通常用于将一维张量转换为二维或三维张量,以便与嵌入层的输入形状匹配。
            x_embedding.append(self.num_embedding(x_num[..., None]))
        if self.cat_embedding:
            x_embedding.append(self.cat_embedding(x_cat))
        x_embedding = torch.cat(x_embedding, dim=1)
        if self.n_classes == 1:
            x = x + self.fm(x_embedding)
        else:
            x = x+ self.fm_linear(self.fm(x_embedding))

        # deep 部分
        x = x + self.deep(x_embedding.view(-1, self.deep_in))
        if self.n_classes == 1:
            x = x.squeeze(-1)
        return x
# 测试DeepFM
model = DeepFM(d_numerical=3, categories=[4, 3, 2], d_embed=4, deep_layers=[20, 20], deep_dropout=0.1, n_classes=1)
x_num = torch.randn(2, 3)
x_cat = torch.randint(0, 2, (2, 3))
model((x_num, x_cat))

"""
tensor([-0.0014, -0.1834], grad_fn=<SqueezeBackward1>)
"""

3.criteo数据集完整范例

import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torchkeras

# 准备数据
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# dfdata = pd.read_csv('./dataset/cretio/cretio_small_small.csv', sep='\t', header=None)
dfdata = pd.read_csv('./dataset/cretio/dac_sample.txt', sep='\t', header=None)
dfdata.columns = ["label"] + ["I"+str(x) for x in range(1, 14)] + ["C"+str(x) for x in range(14, 40)]

cat_cols = [x for x in dfdata.columns if x.startswith('C')]
num_cols = [x for x in dfdata.columns if x.startswith('I')]
num_pipe = Pipeline(steps=[("impute", SimpleImputer()), ("quantile", QuantileTransformer())])  # 用于将数据的分布转化为均匀分布或正态分布

for col in cat_cols:
    dfdata[col] = LabelEncoder().fit_transform(dfdata[col])
dfdata[num_cols] = num_pipe.fit_transform(dfdata[num_cols])
categories = [dfdata[col].max()+1 for col in cat_cols]

import torch
from torch.utils.data import Dataset, DataLoader

# 将DataFrame转换成数据集Dataset特征分割成X_num, X_cat方式
class DfDataset(Dataset):
    def __init__(self, df, label_col, num_features, cat_features, categories, is_training=True):
        self.X_num = torch.tensor(df[num_features].values).float() if num_features else None
        self.X_cat = torch.tensor(df[cat_features].values).long() if cat_features else None
        self.Y = torch.tensor(df[label_col].values).float()
        self.categories = categories
        self.is_training = is_training

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, index):
        if self.is_training:
            return ((self.X_num[index], self.X_cat[index]), self.Y[index])
        else:
            return (self.X_num[index], self.X_cat[index])

    def get_categories(self):
        return self.categories
    
dftrain_val, dftest = train_test_split(dfdata, test_size=0.2)
dftrain, dfval = train_test_split(dftrain_val, test_size=0.2)
ds_train = DfDataset(dftrain,label_col = "label",num_features = num_cols,cat_features = cat_cols,
                    categories = categories, is_training=True)

ds_val = DfDataset(dfval,label_col = "label",num_features = num_cols,cat_features = cat_cols,
                    categories = categories, is_training=True)

ds_test = DfDataset(dftest,label_col = "label",num_features = num_cols,cat_features = cat_cols,
                    categories = categories, is_training=True)

dl_train = DataLoader(ds_train, batch_size=2048, shuffle=True)
dl_val = DataLoader(ds_val,batch_size = 2048,shuffle=False)
dl_test = DataLoader(ds_test,batch_size = 2048,shuffle=False)

for features,labels in dl_train:
    break 
    
# 定义模型
def create_net():
    net = DeepFM(d_numerical=ds_train.X_num.shape[1], categories=ds_train.get_categories(),
                d_embed=8, deep_layers=[128, 64, 32], deep_dropout=0.25, n_classes=1)
    return net

from torchkeras import summary

net = create_net()
summary(net, input_data=features);

"""
--------------------------------------------------------------------------
Layer (type)                            Output Shape              Param #
==========================================================================
Linear-1                                     [-1, 1]                   14
Embedding-2                              [-1, 26, 1]              241,338
NumEmbedding-3                           [-1, 13, 8]                  104
Embedding-4                              [-1, 26, 8]            1,930,704
FMLayer-5                                    [-1, 1]                    0
Linear-6                                   [-1, 128]               40,064
BatchNorm1d-7                              [-1, 128]                  256
ReLU-8                                     [-1, 128]                    0
Dropout-9                                  [-1, 128]                    0
Linear-10                                   [-1, 64]                8,256
BatchNorm1d-11                              [-1, 64]                  128
ReLU-12                                     [-1, 64]                    0
Dropout-13                                  [-1, 64]                    0
Linear-14                                   [-1, 32]                2,080
BatchNorm1d-15                              [-1, 32]                   64
ReLU-16                                     [-1, 32]                    0
Dropout-17                                  [-1, 32]                    0
Linear-18                                    [-1, 1]                   33
==========================================================================
Total params: 2,223,041
Trainable params: 2,223,041
Non-trainable params: 0
--------------------------------------------------------------------------
Input size (MB): 0.000084
Forward/backward pass size (MB): 0.009438
Params size (MB): 8.480228
Estimated Total Size (MB): 8.489750
--------------------------------------------------------------------------
"""
# 训练模型
# 我们使用梦中情炉torchkeras来实现最优雅的训练循环。
from torchkeras.metrics import AUC
from torchkeras import KerasModel

loss_fn = nn.BCEWithLogitsLoss()  # BCEWithLogitsLoss 接收模型的输出(logits),不需要手动应用 sigmoid 激活函数。
metrics_dict = {"auc": AUC()}

optimizer = torch.optim.Adam(net.parameters(), lr=0.002, weight_decay=0.001)

model = KerasModel(net, loss_fn=loss_fn, metrics_dict=metrics_dict, optimizer=optimizer)

dfhistory = model.fit(train_data=dl_train, val_data=dl_val, epochs=100, ckpt_path='checkpoint', 
                     patience=5, monitor='val_auc', mode='max', plot=True)

# 评估模型
model.evaluate(dl_test)

"""
{'val_loss': 0.7424652814865113, 'val_auc': 0.7181903719902039}
"""

# 使用模型
from sklearn.metrics import roc_auc_score

model.eval()
# 测试阶段:在评估模型之前,使用 accelerator.prepare() 可以确保测试数据以最佳方式加载,尤其是在使用多 GPU 时。
dl_test = model.accelerator.prepare(dl_test)

with torch.no_grad():
    result = torch.cat([model.forward(t[0]) for t in dl_test])

preds = F.sigmoid(result).cpu()
labels = torch.cat([x[-1] for x in dl_test]).cpu()

val_auc = roc_auc_score(labels.numpy(), preds.numpy())

print(val_auc)

"""
0.7181925663222225

"""

# 保存模型
# 模型最佳权重已经保存在 model.fit(ckpt_path) 传入的参数中了。
net_clone = create_net()
net_clone.load_state_dict(torch.load(model.ckpt_path))

"""
<All keys matched successfully>
"""
posted @ 2024-09-22 19:02  lotuslaw  阅读(43)  评论(0编辑  收藏  举报