推荐模型NeuralCF(NCF):Torch实现

1. 使用MovieLens数据集搭建了一个简易torch版本的NCF模型,采用了约1:1的负采样(每个用户随机采样200部未评分的电影作为负样本),训练10个epoch后二分类准确率达到0.8左右,代码如下:

 

import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
import random
# Hyper-parameters shared by the data preparation and training code below.
config = dict(
    batch_size=128,       # samples per mini-batch
    epochs=10,            # number of passes over the training set
    weight_decay=1e-6,    # weight decay handed to the optimizer
    num_classes=2,        # binary task: interacted / not interacted
    lr=2e-5,              # optimizer learning rate
    negative_sum=200,     # negative samples drawn per user
)
# Data loading: read only the first two fields (userId, movieId) of each rating.
data = pd.read_csv('../dataset/movieLens/ratings.dat',
                   sep='::',
                   names=['userId', 'movieId', 'rating', 'time'],
                   usecols=[0, 1],
                   engine='python')
# One row per user holding the list of movie ids that user rated.
# The frame is indexed by userId and also exposes userId as a regular column.
dataList = data.groupby(by='userId').agg({'movieId': list})
dataList['userId'] = dataList.index
# Distinct movie ids that occur in the ratings.
movieIds = data.movieId.unique()

# Negative sampling: for every user draw up to `negative_sum` distinct movie
# ids (from 1 to the largest id in the ratings) that the user never rated,
# and label those pairs 0.
max_movie_id = int(data['movieId'].max())
negative_rows = []
for userId, rated in zip(dataList['userId'], dataList['movieId']):
    rated = set(rated)  # O(1) membership tests instead of scanning a list
    candidates = [m for m in range(1, max_movie_id + 1) if m not in rated]
    # Never ask for more negatives than exist; this also rules out the endless
    # loop rejection sampling would hit for a user who rated almost everything.
    sample_size = min(config['negative_sum'], len(candidates))
    negative_rows.extend(
        (userId, movieId) for movieId in random.sample(candidates, sample_size)
    )
negative = pd.DataFrame(negative_rows, columns=['userId', 'movieId'])
negative['label'] = 0

# Every observed rating is a positive sample.
data = data.reset_index(drop=True)
data['label'] = 1

# Merge positives and negatives, then split into training and test sets.
data = pd.concat([data, negative]).astype(np.int32)
x_train, x_test, y_train, y_test = train_test_split(
    data.iloc[:, :2],  # features: userId, movieId
    data.iloc[:, 2],   # target: label
    test_size=0.15,
    random_state=2022,
)
x_test.shape

# 数据配置
from torch.utils.data import DataLoader, Dataset
class MovieDataset(Dataset):
    """Pair an indexable collection of features with an indexable collection of labels."""

    def __init__(self, x, y):
        """Store the features ``x`` and the labels ``y``; both are indexed in parallel."""
        super().__init__()
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of samples, taken from the feature collection."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.x[idx]
        label = self.y[idx]
        return features, label

import torch
# Wrap the split arrays in tensors and batch them; only the training data is shuffled.
train_dataset = MovieDataset(
    x=torch.tensor(x_train.values),
    y=torch.tensor(y_train.values),
)
test_dataset = MovieDataset(
    x=torch.tensor(x_test.values),
    y=torch.tensor(y_test.values),
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=config['batch_size'],
)
# 模型搭建
import torch
import torch.nn as nn
class NeuralCF(nn.Module):
    """Neural collaborative filtering: user and movie embeddings fed to an MLP.

    Args:
        userIds: Number of rows in the user embedding table. Ids index the
            table directly, so this must be larger than the biggest user id.
        movieIds: Number of rows in the movie embedding table, with the same
            constraint for movie ids.
    """

    def __init__(self, userIds, movieIds):
        super().__init__()
        self.userEmb = nn.Embedding(userIds, 64)
        self.movieEmb = nn.Embedding(movieIds, 64)
        # The two 64-d embeddings are concatenated into a 128-d input.
        self.sequential = nn.Sequential(
            nn.Linear(128, 96),
            nn.ReLU(),
            nn.Linear(96, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Score every (user, movie) pair in the batch.

        Args:
            x: Integer tensor of shape ``(batch, 2)``; column 0 holds user ids
                and column 1 holds movie ids.

        Returns:
            Tensor of shape ``(batch,)`` with the sigmoid output for each pair.
        """
        # `.long()` converts in place on the tensor's own device, avoiding the
        # numpy round trip that only works for CPU tensors.
        user_idx = x[:, 0].long()
        movie_idx = x[:, 1].long()
        features = torch.cat(
            [self.userEmb(user_idx), self.movieEmb(movie_idx)], dim=1
        )
        scores = self.sequential(features)
        # Drop only the trailing size-1 dimension so a batch containing a
        # single sample still yields shape (1,) rather than a 0-d tensor.
        return scores.squeeze(-1)

from sklearn.metrics import accuracy_score
# Loss function
def criterion(outputs, labels):
    """Return the mean binary cross-entropy of ``outputs`` against ``labels``."""
    return nn.functional.binary_cross_entropy(outputs, labels)
# Accuracy computation
def getAccuracy(outputs, labels):
    """Return the fraction of predictions that match ``labels``.

    A score below 0.5 is classified as 0 and anything else as 1.

    Args:
        outputs: Predicted probabilities; a tensor (0-d allowed) or anything
            ``torch.as_tensor`` accepts.
        labels: Ground-truth 0/1 labels, one per prediction.

    Returns:
        The accuracy as a Python float.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    # Flatten so a 0-d output (a single prediction) is handled like a batch of one.
    scores = torch.as_tensor(outputs).detach().reshape(-1)
    targets = torch.as_tensor(labels).detach().reshape(-1).to(scores.device)
    if scores.numel() != targets.numel():
        raise ValueError(
            f'Found {scores.numel()} predictions but {targets.numel()} labels'
        )
    # "not below 0.5" keeps the original rule: only scores < 0.5 map to class 0.
    predictions = (~(scores < 0.5)).to(targets.dtype)
    return (predictions == targets).float().mean().item()

# 训练
from tqdm.auto import tqdm 
import gc  
def train_one_epoch(epoch, model, dataLoader, optimizer, total_epochs=None):
    """Run one training pass over ``dataLoader`` and update ``model`` in place.

    Args:
        epoch: Number of the current epoch, used only for progress display.
        model: Module called as ``model(x)`` to produce predictions.
        dataLoader: Iterable of ``(x, y)`` batches that supports ``len()``.
        optimizer: Optimizer that updates the parameters of ``model``.
        total_epochs: Epoch count shown in the progress bar; defaults to
            ``config['epochs']``.

    Returns:
        The sum over all batches of ``batch loss * batch size``, i.e. the
        total (not the mean) loss of the epoch.
    """
    if total_epochs is None:
        total_epochs = config['epochs']
    model.train()
    bar = tqdm(dataLoader, total=len(dataLoader))
    bar.set_description(f'Epoch: [{epoch}/{total_epochs}]')
    dataset_size = 0
    running_loss = 0
    for x, y in bar:
        batch_size = x.shape[0]
        # Clear gradients first so nothing left over from earlier calls leaks in.
        optimizer.zero_grad()
        outputs = model(x)
        # The loss needs float targets; `.float()` converts on the tensor's own device.
        loss = criterion(outputs, y.float())
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so the running mean is per sample.
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        bar.set_postfix(Epoch=epoch, Train_loss=epoch_loss)
    gc.collect()
    return running_loss

# Accuracy validation
def valid_one_epoch(epoch, model, dataLoader, total_epochs=None):
    """Evaluate ``model`` on ``dataLoader`` and show the running accuracy.

    Args:
        epoch: Number of the current epoch, used only for progress display.
        model: Module called as ``model(x)`` to produce predictions.
        dataLoader: Iterable of ``(x, y)`` batches that supports ``len()``.
        total_epochs: Epoch count shown in the progress bar; defaults to
            ``config['epochs']``.

    Returns:
        The sum of the per-batch accuracies. The progress bar shows this sum
        divided by the number of batches seen so far.
    """
    if total_epochs is None:
        total_epochs = config['epochs']
    model.eval()
    bar = tqdm(dataLoader, total=len(dataLoader))
    bar.set_description(f'Epoch: [{epoch}/{total_epochs}]')
    total_accuracy = 0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for item, (x, y) in enumerate(bar, start=1):
            outputs = model(x)
            total_accuracy += getAccuracy(outputs, y)
            bar.set_postfix(Epoch=epoch, Accuracy=total_accuracy / item)
    gc.collect()
    return total_accuracy

# The embedding tables are indexed directly by raw id, so each needs
# (largest id + 1) rows. The sizes are taken from the combined positive and
# negative samples instead of the previously hard-coded 6041 and 3953.
num_users = int(data['userId'].max()) + 1
num_movies = int(data['movieId'].max()) + 1
model = NeuralCF(num_users, num_movies)
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
# Alternate one training pass and one validation pass per epoch.
for epoch in range(1, config['epochs'] + 1):
    train_one_epoch(epoch, model, train_loader, optimizer)
    valid_one_epoch(epoch, model, test_loader)
epoch 1  Accuracy :0.629
epoch 2  Accuracy :0.702
epoch 3  Accuracy :0.739
epoch 4  Accuracy :0.762
epoch 5  Accuracy :0.777
epoch 6  Accuracy :0.786
epoch 7  Accuracy :0.794
epoch 8  Accuracy :0.8
epoch 9  Accuracy :0.804
epoch 10  Accuracy :0.807

  

posted @ 2022-07-13 19:33  麦扣  阅读(495)  评论(0)  编辑  收藏  举报