推荐模型NeuralCF(NCF):Torch实现
1. 使用 MovieLens 数据集搭建了一个简易的 PyTorch 版 NCF 模型:采用约 1:1 的负采样(为每个用户随机采样 200 个其未评分的电影作为负样本),训练 10 个 epoch 后二分类准确率达到 0.8 左右。代码如下:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
# Hyper-parameters read by the sampling, data-loading and training code below.
config = dict(
    batch_size=128,      # mini-batch size for both data loaders
    epochs=10,           # number of train / validate rounds
    weight_decay=1e-6,   # passed to the Adam optimizer
    num_classes=2,       # not referenced by the code in this file
    lr=2e-5,             # Adam learning rate
    negative_sum=200,    # negative samples drawn per user
)
# Load data: keep only the (userId, movieId) columns of the ratings file.
data = pd.read_csv(
    '../dataset/movieLens/ratings.dat',
    sep='::',
    names=['userId', 'movieId', 'rating', 'time'],
    usecols=[0, 1],
    engine='python',  # multi-character separators are handled by the Python parser
)

# One row per user, holding the list of movies that user rated.
dataList = data.groupby(by='userId').agg({'movieId': list})
dataList['userId'] = dataList.index
# NOTE: the index is intentionally left as userId. The previous
# `dataList.reset_index(drop=True)` discarded its result (a no-op); actually
# applying it would break the `dataList.loc[userId]` lookups done later.

# Every distinct movie id that occurs in the ratings.
movieIds = data.movieId.unique()
# Negative sampling: for every user draw up to config['negative_sum'] distinct
# movies that the user has NOT rated and give them label 0.
rng = random.Random(2022)  # dedicated, seeded generator => reproducible negatives
all_movie_ids = [int(m) for m in movieIds]  # only ids that really occur in the data

neg_users = []
neg_movies = []
for userId, rated in zip(dataList['userId'], dataList['movieId']):
    seen = set(rated)  # set gives O(1) membership tests
    candidates = [m for m in all_movie_ids if m not in seen]
    # Sample without replacement so a user never gets the same negative twice;
    # cap the draw in case fewer unrated movies exist than requested.
    picked = rng.sample(candidates, min(config['negative_sum'], len(candidates)))
    neg_users.extend([userId] * len(picked))
    neg_movies.extend(picked)

# Long format: one (userId, movieId, label=0) row per sampled negative.
negative = pd.DataFrame({'userId': neg_users, 'movieId': neg_movies})
negative['label'] = 0
negative.head()  # notebook preview; has no effect when run as a script
# Positive samples: every observed (userId, movieId) rating gets label 1.
# `data` already holds one movie per row, so no explode() is required.
data = data.reset_index(drop=True)
data['label'] = 1
data.head()  # notebook preview; has no effect when run as a script

# Train / test split.
# ignore_index=True gives the stacked frame unique row labels instead of
# repeating the labels of the two source frames.
data = pd.concat([data, negative], ignore_index=True).astype(np.int32)
x_train, x_test, y_train, y_test = train_test_split(
    data[['userId', 'movieId']],  # features, selected by name rather than position
    data['label'],
    test_size=0.15,
    random_state=2022,
)
x_test.shape  # notebook preview; has no effect when run as a script
# 数据配置
from torch.utils.data import DataLoader, Dataset
class MovieDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        x: Indexable collection of samples (the script passes a tensor whose
            rows are ``[userId, movieId]``).
        y: Indexable collection of labels, aligned element-wise with ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same length.
    """

    def __init__(self, x, y):
        super().__init__()
        # Fail early instead of hitting an IndexError midway through an epoch.
        if len(x) != len(y):
            raise ValueError(
                f'x and y must have the same length, got {len(x)} and {len(y)}'
            )
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)
import torch
# Wrap both splits as tensors; only the training loader reshuffles its samples.
train_dataset = MovieDataset(
    x=torch.tensor(x_train.values),
    y=torch.tensor(y_train.values),
)
test_dataset = MovieDataset(
    x=torch.tensor(x_test.values),
    y=torch.tensor(y_test.values),
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
)
# 模型搭建
import torch
import torch.nn as nn
class NeuralCF(nn.Module):
    """Neural collaborative filtering: user / movie embeddings fed to an MLP.

    Args:
        userIds: Number of rows in the user embedding table. Ids are used
            directly as row indices, so this must exceed the largest user id.
        movieIds: Number of rows in the movie embedding table (same rule).
        embedding_dim: Width of each embedding vector. Defaults to 64.
    """

    def __init__(self, userIds, movieIds, embedding_dim=64):
        super().__init__()
        self.userEmb = nn.Embedding(userIds, embedding_dim)
        self.movieEmb = nn.Embedding(movieIds, embedding_dim)
        # The MLP consumes the concatenated user + movie embeddings.
        self.sequential = nn.Sequential(
            nn.Linear(2 * embedding_dim, 96),
            nn.ReLU(),
            nn.Linear(96, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Score a batch of (userId, movieId) pairs.

        Args:
            x: Integer tensor of shape ``(batch, 2)``; column 0 holds user
                ids and column 1 holds movie ids.

        Returns:
            Tensor of shape ``(batch,)`` with sigmoid outputs in ``[0, 1]``.
        """
        # Cast on the tensor itself rather than going through numpy, so the
        # input may live on any device.
        user_idx = x[:, 0].long()
        movie_idx = x[:, 1].long()
        features = torch.cat(
            [self.userEmb(user_idx), self.movieEmb(movie_idx)], dim=1
        )
        # Drop only the trailing size-1 dimension: a bare squeeze() would also
        # collapse a batch of one into a 0-d tensor that no longer matches the
        # shape of its labels.
        return self.sequential(features).squeeze(-1)
from sklearn.metrics import accuracy_score
# Loss function
def criterion(outputs, labels):
    """Return the mean binary cross-entropy of ``outputs`` against ``labels``.

    Args:
        outputs: Tensor of predicted probabilities in ``[0, 1]``.
        labels: Tensor of 0/1 targets with the same shape as ``outputs``;
            integer targets are accepted and cast to the dtype of ``outputs``.

    Returns:
        Scalar loss tensor.
    """
    # BCELoss requires floating-point targets; cast here so callers may pass
    # the integer labels straight from the data loader.
    return nn.BCELoss()(outputs, labels.to(outputs.dtype))
# Compute accuracy
def getAccuracy(outputs, labels):
    """Return the fraction of predictions that match ``labels``.

    A score below 0.5 is predicted as class 0, anything else as class 1.

    Args:
        outputs: Predicted scores (tensor or sequence of numbers).
        labels: Ground-truth 0/1 labels (tensor or sequence), one per score.

    Returns:
        Accuracy as a Python float in ``[0, 1]``; ``0.0`` for empty input.

    Raises:
        ValueError: If ``outputs`` and ``labels`` hold different numbers of
            elements.
    """
    # Flatten so that 0-d (single sample) outputs are handled as well.
    scores = torch.as_tensor(outputs).detach().reshape(-1)
    targets = torch.as_tensor(labels).reshape(-1).long().to(scores.device)
    if scores.numel() != targets.numel():
        raise ValueError(
            f'outputs and labels differ in size: {scores.numel()} vs {targets.numel()}'
        )
    if scores.numel() == 0:
        return 0.0
    # Vectorized threshold; `~(s < 0.5)` keeps the "not below 0.5 => class 1" rule.
    predictions = (~(scores < 0.5)).long()
    return (predictions == targets).sum().item() / scores.numel()
# 训练
from tqdm.auto import tqdm
import gc
def train_one_epoch(epoch, model, dataLoader, optimizer, total_epochs=None):
    """Run one optimization pass of ``model`` over ``dataLoader``.

    Args:
        epoch: Current epoch number; only used to label the progress bar.
        model: Module called as ``model(x)``; its output is passed to
            ``criterion`` together with the batch labels.
        dataLoader: Iterable yielding ``(x, y)`` batches; must support ``len``.
        optimizer: Optimizer that updates the parameters of ``model``.
        total_epochs: Epoch count shown in the progress bar. Defaults to
            ``config['epochs']``.

    Returns:
        The running loss: the sum over batches of ``batch mean loss *
        batch size``. This is the un-normalized total, not the epoch average
        that is shown in the progress bar.
    """
    if total_epochs is None:
        total_epochs = config['epochs']

    model.train()
    bar = tqdm(enumerate(dataLoader), total=len(dataLoader))
    bar.set_description(f'Epoch: [{epoch}/{total_epochs}]')

    dataset_size = 0
    running_loss = 0
    for step, (x, y) in bar:
        batch_size = x.shape[0]
        outputs = model(x)
        # The loss needs float targets; cast on the tensor instead of
        # round-tripping through numpy.
        loss = criterion(outputs, y.float())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Weight each batch by its size so the displayed value is a per-sample mean.
        running_loss += loss.item() * batch_size
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size
        bar.set_postfix(Epoch=epoch, Train_loss=epoch_loss)

    # Collect garbage once per epoch rather than once per step.
    gc.collect()
    return running_loss
# Accuracy validation
def valid_one_epoch(epoch, model, dataLoader, total_epochs=None):
    """Evaluate ``model`` on ``dataLoader`` and report the running accuracy.

    Args:
        epoch: Current epoch number; only used to label the progress bar.
        model: Module called as ``model(x)``; its output is passed to
            ``getAccuracy`` together with the batch labels.
        dataLoader: Iterable yielding ``(x, y)`` batches; must support ``len``.
        total_epochs: Epoch count shown in the progress bar. Defaults to
            ``config['epochs']``.

    Returns:
        The sum of the per-batch accuracies. Divide by the number of batches
        to obtain the mean accuracy that is shown in the progress bar.
    """
    if total_epochs is None:
        total_epochs = config['epochs']

    model.eval()
    bar = tqdm(enumerate(dataLoader), total=len(dataLoader))
    bar.set_description(f'Epoch: [{epoch}/{total_epochs}]')

    total_accuracy = 0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for step, (x, y) in bar:
            outputs = model(x)
            total_accuracy += getAccuracy(outputs, y)
            # step is zero-based, so step + 1 is the number of batches seen.
            bar.set_postfix(Epoch=epoch, Accuracy=total_accuracy / (step + 1))

    # Collect garbage once per epoch rather than once per step.
    gc.collect()
    return total_accuracy
# Ids are used directly as embedding row indices, so each table needs
# (largest id + 1) rows. Deriving the sizes from `data` replaces the
# previously hard-coded 6041 users / 3953 movies.
num_users = int(data['userId'].max()) + 1
num_movies = int(data['movieId'].max()) + 1
model = NeuralCF(num_users, num_movies)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=config['lr'],
    weight_decay=config['weight_decay'],
)

# Alternate one training pass and one validation pass per epoch.
for epoch in range(1, config['epochs'] + 1):
    train_one_epoch(epoch, model, train_loader, optimizer)
    valid_one_epoch(epoch, model, test_loader)
epoch 1 Accuracy :0.629
epoch 2 Accuracy :0.702
epoch 3 Accuracy :0.739
epoch 4 Accuracy :0.762
epoch 5 Accuracy :0.777
epoch 6 Accuracy :0.786
epoch 7 Accuracy :0.794
epoch 8 Accuracy :0.8
epoch 9 Accuracy :0.804
epoch 10 Accuracy :0.807