Recommender Systems (3): Implementing DCN with PyTorch
1. main.py
```python
import os

import numpy as np
import torch
import tqdm
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader

from torchfm.dataset.criteo import CriteoDataset
from torchfm.model.dcn import DeepCrossNetworkModel
from torchfm.model.fnn import FactorizationSupportedNeuralNetworkModel
from torchfm.model.pnn import ProductNeuralNetworkModel


def get_dataset(path):
    return CriteoDataset(path)


def get_model(name, dataset):
    """Hyperparameters are empirically determined, not optimized."""
    field_dims = dataset.field_dims
    if name == 'fnn':
        return FactorizationSupportedNeuralNetworkModel(field_dims, embed_dim=16, mlp_dims=(16, 16), dropout=0.2)
    elif name == 'ipnn':
        return ProductNeuralNetworkModel(field_dims, embed_dim=16, mlp_dims=(16,), method='inner', dropout=0.2)
    elif name == 'opnn':
        return ProductNeuralNetworkModel(field_dims, embed_dim=16, mlp_dims=(16,), method='outer', dropout=0.2)
    elif name == 'dcn':
        return DeepCrossNetworkModel(field_dims, embed_dim=16, num_layers=3, mlp_dims=(16, 16), dropout=0.2)
    else:
        raise ValueError('unknown model name: ' + name)


class EarlyStopper(object):

    def __init__(self, num_trials, save_path):
        self.num_trials = num_trials
        self.trial_counter = 0
        self.best_accuracy = 0
        self.best_loss = float('inf')  # was a magic constant 100; inf is safer
        self.save_path = save_path

    def is_continuable(self, model, accuracy):
        if accuracy > self.best_accuracy:
            self.best_accuracy = accuracy
            self.trial_counter = 0
            torch.save(model, self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False

    def is_continue_loss(self, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.trial_counter = 0
            torch.save(model, self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1
            return True
        else:
            return False


def train(model, optimizer, data_loader, criterion, device, log_interval=100):
    model.train()
    total_loss = 0
    tk0 = tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0)
    for i, (fields, target) in enumerate(tk0):
        fields, target = fields.to(device), target.to(device)
        y = model(fields)
        loss = criterion(y, target.float())
        model.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if (i + 1) % log_interval == 0:
            tk0.set_postfix(loss=total_loss / log_interval)
            total_loss = 0


def my_test(model, data_loader, device, criterion):
    model.eval()
    targets, predicts = list(), list()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for fields, target in tqdm.tqdm(data_loader, smoothing=0, mininterval=1.0):
            fields, target = fields.to(device), target.to(device)
            y = model(fields)
            targets.extend(target.tolist())
            predicts.extend(y.tolist())
            # accumulate over all batches; the original kept only the last batch's loss
            total_loss += criterion(y, target.float()).item()
            n_batches += 1
    return total_loss / max(n_batches, 1), roc_auc_score(targets, predicts)


def main(dataset_path, model_name, epoch, learning_rate, batch_size, weight_decay, device, save_dir):
    device = torch.device(device)
    dataset = get_dataset(dataset_path)
    dims = dataset.field_dims
    npy_out = "field_dims.npy"
    if not os.path.exists(npy_out):
        np.save(npy_out, dims)
    train_length = int(len(dataset) * 0.8)
    valid_length = int(len(dataset) * 0.1)
    test_length = len(dataset) - train_length - valid_length
    train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
        dataset, (train_length, valid_length, test_length))
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0)
    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, num_workers=0)
    test_data_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=0)
    model = get_model(model_name, dataset).to(device)
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    early_stopper = EarlyStopper(num_trials=5, save_path=f'{save_dir}/{model_name}.pt')
    for epoch_i in range(epoch):
        train(model, optimizer, train_data_loader, criterion, device)
        loss, auc = my_test(model, valid_data_loader, device, criterion)
        print('epoch:', epoch_i, 'validation: auc:', auc, 'loss:', loss)
        if not early_stopper.is_continuable(model, auc):
            print(f'validation: best auc: {early_stopper.best_accuracy}')
            break
    loss, auc = my_test(model, test_data_loader, device, criterion)
    print(f'test loss: {loss}, test auc: {auc}')


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_path', default=r'E:\recommend_system\data\train.txt')
    parser.add_argument('--model_name', default='dcn')
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--learning_rate', type=float, default=0.005)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--weight_decay', type=float, default=1e-6)
    parser.add_argument('--device', default='cpu')
    parser.add_argument('--save_dir', default='E:/recommend_system/chkpt')
    args = parser.parse_args()
    main(args.dataset_path, args.model_name, args.epoch, args.learning_rate,
         args.batch_size, args.weight_decay, args.device, args.save_dir)
```
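A note on the loss pairing: `DeepCrossNetworkModel.forward` ends with `torch.sigmoid`, so the script uses `BCELoss` on probabilities. A minimal sketch (my own check, not part of the original scripts) showing this is equivalent to the numerically safer `BCEWithLogitsLoss` applied to raw logits:

```python
import torch

# BCELoss on sigmoid(logits) matches BCEWithLogitsLoss on the raw logits,
# up to floating-point error; the latter avoids log(0) issues in training.
logits = torch.tensor([0.3, -1.2, 2.0])
target = torch.tensor([1.0, 0.0, 1.0])

loss_prob = torch.nn.BCELoss()(torch.sigmoid(logits), target)
loss_logit = torch.nn.BCEWithLogitsLoss()(logits, target)
print(loss_prob.item(), loss_logit.item())  # two nearly identical values
```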
2. predicts.py
```python
import math
from functools import lru_cache

import numpy as np
import torch


@lru_cache(maxsize=None)
def convert_numeric_feature(val: str):
    if val == '':
        return 'NULL'
    v = int(val)
    if v > 2:
        return str(int(math.log(v) ** 2))
    else:
        return str(v - 2)


class InferencePredict():

    def __init__(self):
        mapper_path = "mapper.npy"
        self.model_path = r"E:\recommend_system\chkpt\dcn.pt"
        # total number of columns (numeric + categorical)
        self.NUM_FEATS = 23
        # number of numeric columns
        self.NUM_INT_FEATS = 7
        read_dictionary = np.load(mapper_path, allow_pickle=True).item()
        self.feat_mapper = read_dictionary["feat_mapper"]
        self.defaults = read_dictionary["defaults"]
        self.model = torch.load(self.model_path)
        self.model.eval()

    def get_data(self, dataline, feat_mapper, defaults):
        values = dataline.rstrip('\n').split('\t')
        np_array = np.zeros(self.NUM_FEATS + 1, dtype=np.uint32)
        np_array[0] = int(values[0])
        for i in range(1, self.NUM_INT_FEATS + 1):
            np_array[i] = feat_mapper[i].get(convert_numeric_feature(values[i]), defaults[i])
        for i in range(self.NUM_INT_FEATS + 1, self.NUM_FEATS + 1):
            np_array[i] = feat_mapper[i].get(values[i], defaults[i])
        return np_array

    def predict(self, dataline):
        # np.long was removed in NumPy 1.24+; int64 is the equivalent dtype
        np_array = self.get_data(dataline, self.feat_mapper, self.defaults).astype(dtype=np.int64)
        x, y = np_array[1:], np_array[0]
        with torch.no_grad():
            output = self.model(torch.from_numpy(x))
        print(output)
        preds = int(torch.round(output).item())
        return preds, y

    def main(self):
        # Sample records. Note get_data splits on '\t': the separators below
        # must be real tab characters, not spaces.
        s1 = "1 450 18 47 619 153 12 0 male daily 1 0 0 0 0 MD280612 BR803759 PV285968 CT470265 PF470265 10 0 2 0 W441,W19887,W45818,W63,W894,W38883,W135945,W1684,W72349,W15298,W858,W38883"
        s2 = "0 290 0 34 0 0 12 5 male monthly 1 0 0 0 0 MD441850 BR803759 PV419710 CT378940 PF470265 0 0 3 0 W605,W396,W554,W6275,W6257,W651,W51,W32265,W682,W250,W97,W1748"
        s3 = "0 290 0 34 0 0 12 5 male monthly 1 0 0 0 0 MD441850 BR803759 PV419710 CT378940 PF470265 0 0 3 0 W605,W396,W554,W6275,W6257,W651,W51,W32265,W682,W250,W97,W1748"
        self.predict(s1)
        self.predict(s2)
        self.predict(s3)


if __name__ == '__main__':
    InferencePredict().main()
```
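A minimal usage sketch for the class above. It assumes `mapper.npy` and the `dcn.pt` checkpoint produced by main.py already exist; since `get_data` splits on `'\t'`, re-joining on tabs guards against separators having been pasted as spaces. The trailing multi-value `W…` field in the author's samples sits past index `NUM_FEATS` and is simply ignored by `get_data`:

```python
# Hypothetical invocation; requires mapper.npy and the saved dcn.pt checkpoint.
predictor = InferencePredict()

# One record: label, 7 numeric fields, 16 categorical fields (values from s1).
raw = ("1 450 18 47 619 153 12 0 male daily 1 0 0 0 0 MD280612 BR803759 "
       "PV285968 CT470265 PF470265 10 0 2 0")
pred, label = predictor.predict("\t".join(raw.split()))
print(f"predicted={pred}, ground_truth={label}")
```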
3. criteo.py
```python
import math
import os
import shutil
import struct
from collections import defaultdict
from functools import lru_cache
from pathlib import Path

import lmdb
import numpy as np
import torch.utils.data
from tqdm import tqdm


class CriteoDataset(torch.utils.data.Dataset):
    """
    Criteo Display Advertising Challenge Dataset

    Data preparation:
        * Remove infrequent features (appearing in fewer than ``min_threshold``
          instances) and treat them as a single feature.
        * Discretize numerical values with the log-squared transformation
          proposed by the winner of the Criteo competition.

    :param dataset_path: criteo train.txt path.
    :param cache_path: lmdb cache path.
    :param rebuild_cache: If True, lmdb cache is refreshed.
    :param min_threshold: infrequent feature threshold.

    Reference:
        https://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset
        https://www.csie.ntu.edu.tw/~r01922136/kaggle-2014-criteo.pdf
    """

    def __init__(self, dataset_path=None, cache_path='.criteo', rebuild_cache=True, min_threshold=10):
        # total number of columns (numeric + categorical)
        self.NUM_FEATS = 23
        # number of numeric columns
        self.NUM_INT_FEATS = 7
        self.min_threshold = min_threshold
        if rebuild_cache or not Path(cache_path).exists():
            shutil.rmtree(cache_path, ignore_errors=True)
            if dataset_path is None:
                raise ValueError('create cache: failed: dataset_path is None')
            self.__build_cache(dataset_path, cache_path)
        self.env = lmdb.open(cache_path, create=False, lock=False, readonly=True)
        with self.env.begin(write=False) as txn:
            self.length = txn.stat()['entries'] - 1  # one entry holds field_dims
            self.field_dims = np.frombuffer(txn.get(b'field_dims'), dtype=np.uint32)

    def __getitem__(self, index):
        with self.env.begin(write=False) as txn:
            # np.long was removed in NumPy 1.24+; int64 is the equivalent dtype
            np_array = np.frombuffer(
                txn.get(struct.pack('>I', index)), dtype=np.uint32).astype(dtype=np.int64)
        return np_array[1:], np_array[0]

    def __len__(self):
        return self.length

    def __build_cache(self, path, cache_path):
        feat_mapper, defaults = self.__get_feat_mapper(path)
        # persist the mapping so predicts.py can encode raw lines the same way
        my_dict = {"feat_mapper": feat_mapper, "defaults": defaults}
        my_path = "mapper.npy"
        if not os.path.exists(my_path):
            np.save(my_path, my_dict)
        with lmdb.open(cache_path, map_size=int(1e11)) as env:
            field_dims = np.zeros(self.NUM_FEATS, dtype=np.uint32)
            for i, fm in feat_mapper.items():
                field_dims[i - 1] = len(fm) + 1  # +1 for the OOV/default index
            with env.begin(write=True) as txn:
                txn.put(b'field_dims', field_dims.tobytes())
            for buffer in self.__yield_buffer(path, feat_mapper, defaults):
                with env.begin(write=True) as txn:
                    for key, value in buffer:
                        txn.put(key, value)

    def __get_feat_mapper(self, path):
        feat_cnts = defaultdict(lambda: defaultdict(int))
        with open(path) as f:
            pbar = tqdm(f, mininterval=1, smoothing=0.1)
            pbar.set_description('Create criteo dataset cache: counting features')
            for line in pbar:
                values = line.rstrip('\n').split('\t')
                for i in range(1, self.NUM_INT_FEATS + 1):
                    feat_cnts[i][convert_numeric_feature(values[i])] += 1
                for i in range(self.NUM_INT_FEATS + 1, self.NUM_FEATS + 1):
                    feat_cnts[i][values[i]] += 1
        feat_mapper = {i: {feat for feat, c in cnt.items() if c >= self.min_threshold}
                       for i, cnt in feat_cnts.items()}
        feat_mapper = {i: {feat: idx for idx, feat in enumerate(cnt)}
                       for i, cnt in feat_mapper.items()}
        defaults = {i: len(cnt) for i, cnt in feat_mapper.items()}
        return feat_mapper, defaults

    def __yield_buffer(self, path, feat_mapper, defaults, buffer_size=int(1e5)):
        item_idx = 0
        buffer = list()
        with open(path) as f:
            pbar = tqdm(f, mininterval=1, smoothing=0.1)
            pbar.set_description('Create criteo dataset cache: setup lmdb')
            for line in pbar:
                values = line.rstrip('\n').split('\t')
                np_array = np.zeros(self.NUM_FEATS + 1, dtype=np.uint32)
                np_array[0] = int(values[0])
                for i in range(1, self.NUM_INT_FEATS + 1):
                    np_array[i] = feat_mapper[i].get(convert_numeric_feature(values[i]), defaults[i])
                for i in range(self.NUM_INT_FEATS + 1, self.NUM_FEATS + 1):
                    np_array[i] = feat_mapper[i].get(values[i], defaults[i])
                buffer.append((struct.pack('>I', item_idx), np_array.tobytes()))
                item_idx += 1
                if item_idx % buffer_size == 0:
                    yield buffer
                    buffer.clear()
            yield buffer


@lru_cache(maxsize=None)
def convert_numeric_feature(val: str):
    if val == '':
        return 'NULL'
    v = int(val)
    if v > 2:
        return str(int(math.log(v) ** 2))
    else:
        return str(v - 2)
```
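To make the numeric bucketing concrete, a quick check of `convert_numeric_feature` (my own demonstration, assuming the function above is in scope): empty strings map to `'NULL'`, values at or below 2 map to small negative buckets, and larger values are compressed by squaring the natural log.

```python
for raw in ['', '0', '2', '100', '10000']:
    print(repr(raw), '->', convert_numeric_feature(raw))
# ''      -> 'NULL'
# '0'     -> '-2'
# '2'     -> '0'
# '100'   -> '21'   i.e. int(math.log(100) ** 2)
# '10000' -> '84'   i.e. int(math.log(10000) ** 2)
```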
4. dcn.py
```python
import torch

from torchfm.layer import FeaturesEmbedding, CrossNetwork, MultiLayerPerceptron


class DeepCrossNetworkModel(torch.nn.Module):
    """
    A pytorch implementation of Deep & Cross Network.

    Reference:
        R Wang, et al. Deep & Cross Network for Ad Click Predictions, 2017.
    """

    def __init__(self, field_dims, embed_dim, num_layers, mlp_dims, dropout):
        super().__init__()
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.embed_output_dim = len(field_dims) * embed_dim
        self.cn = CrossNetwork(self.embed_output_dim, num_layers)
        self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropout, output_layer=False)
        self.linear = torch.nn.Linear(mlp_dims[-1] + self.embed_output_dim, 1)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        embed_x = self.embedding(x).view(-1, self.embed_output_dim)
        x_l1 = self.cn(embed_x)   # cross part on the flattened embeddings
        h_l2 = self.mlp(embed_x)  # deep part on the same input
        x_stack = torch.cat([x_l1, h_l2], dim=1)
        p = self.linear(x_stack)
        return torch.sigmoid(p.squeeze(1))
```
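For reference, each iteration of `CrossNetwork` used above computes the cross-layer recurrence from the DCN paper, where `self.w[i](x)` produces the scalar $w_l^\top x_l$ for each sample:

$$x_{l+1} = x_0 \, (w_l^\top x_l) + b_l + x_l, \qquad x_0,\ x_l,\ b_l,\ w_l \in \mathbb{R}^{d},\quad d = \text{len(field\_dims)} \times \text{embed\_dim}$$

Because $w_l^\top x_l$ is a scalar, each layer adds only $2d$ parameters, and the residual term $+\,x_l$ keeps the original signal flowing through the stack.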
5. layer.py
```python
import numpy as np
import torch
import torch.nn.functional as F


class FeaturesLinear(torch.nn.Module):

    def __init__(self, field_dims, output_dim=1):
        super().__init__()
        self.fc = torch.nn.Embedding(sum(field_dims), output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        # np.long was removed in NumPy 1.24+; int64 is the equivalent dtype
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return torch.sum(self.fc(x), dim=1) + self.bias


class FeaturesEmbedding(torch.nn.Module):

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)


class FieldAwareFactorizationMachine(torch.nn.Module):

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.num_fields = len(field_dims)
        self.embeddings = torch.nn.ModuleList([
            torch.nn.Embedding(sum(field_dims), embed_dim) for _ in range(self.num_fields)
        ])
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
        for embedding in self.embeddings:
            torch.nn.init.xavier_uniform_(embedding.weight.data)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        xs = [self.embeddings[i](x) for i in range(self.num_fields)]
        ix = list()
        for i in range(self.num_fields - 1):
            for j in range(i + 1, self.num_fields):
                ix.append(xs[j][:, i] * xs[i][:, j])
        ix = torch.stack(ix, dim=1)
        return ix


class FactorizationMachine(torch.nn.Module):

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        square_of_sum = torch.sum(x, dim=1) ** 2
        sum_of_square = torch.sum(x ** 2, dim=1)
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim=1, keepdim=True)
        return 0.5 * ix


class MultiLayerPerceptron(torch.nn.Module):

    def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
        super().__init__()
        layers = list()
        for embed_dim in embed_dims:
            layers.append(torch.nn.Linear(input_dim, embed_dim))
            layers.append(torch.nn.BatchNorm1d(embed_dim))
            layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Dropout(p=dropout))
            input_dim = embed_dim
        if output_layer:
            layers.append(torch.nn.Linear(input_dim, 1))
        self.mlp = torch.nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, input_dim)``
        """
        return self.mlp(x)


class InnerProductNetwork(torch.nn.Module):

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        num_fields = x.shape[1]
        row, col = list(), list()
        for i in range(num_fields - 1):
            for j in range(i + 1, num_fields):
                row.append(i)
                col.append(j)
        return torch.sum(x[:, row] * x[:, col], dim=2)


class OuterProductNetwork(torch.nn.Module):

    def __init__(self, num_fields, embed_dim, kernel_type='mat'):
        super().__init__()
        num_ix = num_fields * (num_fields - 1) // 2
        if kernel_type == 'mat':
            kernel_shape = embed_dim, num_ix, embed_dim
        elif kernel_type == 'vec':
            kernel_shape = num_ix, embed_dim
        elif kernel_type == 'num':
            kernel_shape = num_ix, 1
        else:
            raise ValueError('unknown kernel type: ' + kernel_type)
        self.kernel_type = kernel_type
        self.kernel = torch.nn.Parameter(torch.zeros(kernel_shape))
        torch.nn.init.xavier_uniform_(self.kernel.data)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        num_fields = x.shape[1]
        row, col = list(), list()
        for i in range(num_fields - 1):
            for j in range(i + 1, num_fields):
                row.append(i)
                col.append(j)
        p, q = x[:, row], x[:, col]
        if self.kernel_type == 'mat':
            kp = torch.sum(p.unsqueeze(1) * self.kernel, dim=-1).permute(0, 2, 1)
            return torch.sum(kp * q, -1)
        else:
            return torch.sum(p * q * self.kernel.unsqueeze(0), -1)


class CrossNetwork(torch.nn.Module):

    def __init__(self, input_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.w = torch.nn.ModuleList([
            torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)
        ])
        self.b = torch.nn.ParameterList([
            torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)
        ])

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, input_dim)``
        """
        x0 = x
        for i in range(self.num_layers):
            xw = self.w[i](x)            # scalar w_l^T x_l per sample
            x = x0 * xw + self.b[i] + x  # x_{l+1} = x_0 (w_l^T x_l) + b_l + x_l
        return x


class AttentionalFactorizationMachine(torch.nn.Module):

    def __init__(self, embed_dim, attn_size, dropouts):
        super().__init__()
        self.attention = torch.nn.Linear(embed_dim, attn_size)
        self.projection = torch.nn.Linear(attn_size, 1)
        self.fc = torch.nn.Linear(embed_dim, 1)
        self.dropouts = dropouts

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        num_fields = x.shape[1]
        row, col = list(), list()
        for i in range(num_fields - 1):
            for j in range(i + 1, num_fields):
                row.append(i)
                col.append(j)
        p, q = x[:, row], x[:, col]
        inner_product = p * q
        attn_scores = F.relu(self.attention(inner_product))
        attn_scores = F.softmax(self.projection(attn_scores), dim=1)
        attn_scores = F.dropout(attn_scores, p=self.dropouts[0], training=self.training)
        attn_output = torch.sum(attn_scores * inner_product, dim=1)
        attn_output = F.dropout(attn_output, p=self.dropouts[1], training=self.training)
        return self.fc(attn_output)


class CompressedInteractionNetwork(torch.nn.Module):

    def __init__(self, input_dim, cross_layer_sizes, split_half=True):
        super().__init__()
        self.num_layers = len(cross_layer_sizes)
        self.split_half = split_half
        self.conv_layers = torch.nn.ModuleList()
        prev_dim, fc_input_dim = input_dim, 0
        for i in range(self.num_layers):
            cross_layer_size = cross_layer_sizes[i]
            self.conv_layers.append(torch.nn.Conv1d(input_dim * prev_dim, cross_layer_size, 1,
                                                    stride=1, dilation=1, bias=True))
            if self.split_half and i != self.num_layers - 1:
                cross_layer_size //= 2
            prev_dim = cross_layer_size
            fc_input_dim += prev_dim
        self.fc = torch.nn.Linear(fc_input_dim, 1)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        xs = list()
        x0, h = x.unsqueeze(2), x
        for i in range(self.num_layers):
            x = x0 * h.unsqueeze(1)
            batch_size, f0_dim, fin_dim, embed_dim = x.shape
            x = x.view(batch_size, f0_dim * fin_dim, embed_dim)
            x = F.relu(self.conv_layers[i](x))
            if self.split_half and i != self.num_layers - 1:
                x, h = torch.split(x, x.shape[1] // 2, dim=1)
            else:
                h = x
            xs.append(x)
        return self.fc(torch.sum(torch.cat(xs, dim=1), 2))


class AnovaKernel(torch.nn.Module):

    def __init__(self, order, reduce_sum=True):
        super().__init__()
        self.order = order
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        batch_size, num_fields, embed_dim = x.shape
        a_prev = torch.ones((batch_size, num_fields + 1, embed_dim), dtype=torch.float).to(x.device)
        for t in range(self.order):
            a = torch.zeros((batch_size, num_fields + 1, embed_dim), dtype=torch.float).to(x.device)
            a[:, t + 1:, :] += x[:, t:, :] * a_prev[:, t:-1, :]
            a = torch.cumsum(a, dim=1)
            a_prev = a
        if self.reduce_sum:
            return torch.sum(a[:, -1, :], dim=-1, keepdim=True)
        else:
            return a[:, -1, :]
```
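Finally, a small sketch (with hypothetical field sizes) of the `offsets` trick used by `FeaturesLinear`, `FeaturesEmbedding`, and `FieldAwareFactorizationMachine` above: every field shares one embedding table, and each field's raw index is shifted by the cumulative sizes of the preceding fields.

```python
import numpy as np

field_dims = [3, 4, 2]                                # three hypothetical fields
offsets = np.array((0, *np.cumsum(field_dims)[:-1]))  # -> [0, 3, 7]

x = np.array([2, 0, 1])  # one sample: a raw index per field
print(x + offsets)       # -> [2, 3, 8]: rows of a single 9-row embedding table
```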