协同过滤算法简单实现
以下代码主要来自《推荐系统实践》第二章,修正了书中的一些错误,简单实现了基于用户的协同过滤算法(UserCF)和基于物品的协同过滤算法(ItemCF),可供参考:
import math
import random
from collections import defaultdict
from operator import itemgetter


def _cosine_normalize(co_counts, totals):
    """Turn raw co-occurrence weights into cosine-style similarities.

    :param co_counts: ``{x: {y: weight}}`` co-occurrence weights
    :param totals: ``{x: count}`` number of interactions per entity
    :return: ``{x: defaultdict(float)}`` with ``w[x][y] = c[x][y] / sqrt(n[x] * n[y])``
    """
    w = {}
    for x, related in co_counts.items():
        row = w[x] = defaultdict(float)
        for y, cxy in related.items():
            row[y] = cxy / math.sqrt(totals[x] * totals[y])
    return w


def _user_cf_similarity(train, penalize_popular_items):
    """Shared implementation of UserCF and UserCF-IIF."""
    # Build the inverse table item -> set of users who interacted with it.
    item_users = {}
    for u, items in train.items():
        for i in items:
            item_users.setdefault(i, set()).add(u)

    # Accumulate, for every ordered user pair, the weight of co-rated items.
    co_counts = {}
    totals = defaultdict(int)
    for users in item_users.values():
        # len(users) >= 1 here, so log(1 + len(users)) is always positive.
        weight = 1 / math.log(1 + len(users)) if penalize_popular_items else 1
        for u in users:
            totals[u] += 1
            for v in users:
                if u == v:
                    continue
                co_counts.setdefault(u, defaultdict(float))[v] += weight

    return _cosine_normalize(co_counts, totals)


def _item_cf_similarity(train, penalize_active_users):
    """Shared implementation of ItemCF and ItemCF-IUF."""
    co_counts = {}
    totals = defaultdict(int)
    for items in train.values():
        # A pair only exists when the user has at least two items; guarding on
        # that also keeps log(1 + 0) == 0 from ever being used as a divisor.
        if penalize_active_users and len(items) > 1:
            weight = 1 / math.log(1 + len(items) * 1.0)
        else:
            weight = 1
        for i in items:
            totals[i] += 1
            for j in items:
                if i == j:
                    continue
                co_counts.setdefault(i, defaultdict(float))[j] += weight

    return _cosine_normalize(co_counts, totals)


def _top_k(row, k):
    """Return the ``k`` (key, similarity) pairs of ``row`` with highest similarity."""
    return sorted(row.items(), key=itemgetter(1), reverse=True)[0:k]


def user_similarity(train):
    """
    User-based collaborative filtering (UserCF).

    :param train: training set, ``{user: {item: rating}}``
    :return: user similarity matrix, ``{u: {v: similarity}}``; users that share
        no item with anybody have no row
    """
    return _user_cf_similarity(train, penalize_popular_items=False)


def user_similarity2(train):
    """
    User-based collaborative filtering with a popular-item penalty (UserCF-IIF).

    Each co-rated item contributes ``1 / log(1 + number_of_users_of_item)``
    instead of 1, so very popular items count for less.

    :param train: training set, ``{user: {item: rating}}``
    :return: user similarity matrix, ``{u: {v: similarity}}``
    """
    return _user_cf_similarity(train, penalize_popular_items=True)


def item_similarity(train):
    """
    Item-based collaborative filtering (ItemCF).

    :param train: training set, ``{user: {item: rating}}``
    :return: item similarity matrix, ``{i: {j: similarity}}``; items that never
        co-occur with another item have no row
    """
    return _item_cf_similarity(train, penalize_active_users=False)


def item_similarity2(train):
    """
    Item-based collaborative filtering with an active-user penalty (ItemCF-IUF).

    Each user contributes ``1 / log(1 + number_of_items_of_user)`` per item
    pair instead of 1, so very active users count for less.

    :param train: training set, ``{user: {item: rating}}``
    :return: item similarity matrix, ``{i: {j: similarity}}``
    """
    return _item_cf_similarity(train, penalize_active_users=True)


def item_similarity3(train):
    """
    ItemCF-IUF followed by per-row normalisation of the similarity matrix.

    Every row is divided by its own maximum, so the best neighbour of each
    item ends up with similarity 1.0.

    :param train: training set, ``{user: {item: rating}}``
    :return: normalised item similarity matrix, ``{i: {j: similarity}}``
    """
    w = _item_cf_similarity(train, penalize_active_users=True)
    for row in w.values():
        # Rows are only created together with a positive entry, so the row is
        # non-empty and its maximum is strictly positive.
        max_value = max(row.values())
        for j in row:
            row[j] /= max_value
    return w


def recommend_by_item(train, user_id, w, k):
    """
    Score unseen items for ``user_id`` from an item similarity matrix.

    :param train: training set, ``{user: {item: rating}}``
    :param user_id: user to recommend for (must be a key of ``train``)
    :param w: item similarity matrix as returned by ``item_similarity*``
    :param k: number of most similar items considered per item the user has
    :return: ``defaultdict(float)`` mapping candidate item -> score
    """
    rank = defaultdict(float)
    ru = train[user_id]
    for i, pi in ru.items():
        # An item that never co-occurred with another one has no row in w;
        # treat that as "no neighbours" instead of raising KeyError.
        for j, wj in _top_k(w.get(i, {}), k):
            if j in ru:
                continue
            rank[j] += pi * wj
    return rank


def recommend_by_user(user, train, w, k):
    """
    Score unseen items for ``user`` from a user similarity matrix.

    :param user: user to recommend for (must be a key of ``train``)
    :param train: training set, ``{user: {item: rating}}``
    :param w: user similarity matrix as returned by ``user_similarity*``
    :param k: number of most similar users considered
    :return: ``defaultdict(float)`` mapping candidate item -> score
    """
    rank = defaultdict(float)
    interacted_items = train[user]
    # A user sharing no item with anybody has no row in w; treat that as
    # "no neighbours" instead of raising KeyError.
    for v, wuv in _top_k(w.get(user, {}), k):
        for i, rvi in train[v].items():
            if i in interacted_items:
                # Items the user already interacted with are not recommended.
                continue
            rank[i] += wuv * rvi
    return rank


def main():
    """Run the small worked examples for every similarity variant."""
    train = {'A': {'a': 1, 'b': 1, 'd': 1},
             'B': {'a': 1, 'c': 1},
             'C': {'b': 1, 'e': 1},
             'D': {'c': 1, 'd': 1, 'e': 1}}
    rank = recommend_by_user('A', train, user_similarity(train), 3)
    print('UserCF:', dict(rank))
    rank2 = recommend_by_user('A', train, user_similarity2(train), 3)
    print('UserCF-IIF:', dict(rank2))

    train2 = {'A': {'a': 1, 'b': 1, 'd': 1},
              'B': {'b': 1, 'c': 1, 'e': 1},
              'C': {'c': 1, 'd': 1},
              'D': {'b': 1, 'c': 1, 'd': 1},
              'E': {'a': 1, 'd': 1}}
    rank3 = recommend_by_item(train2, 'A', item_similarity(train2), 5)
    print('ItemCF:', dict(rank3))
    rank4 = recommend_by_item(train2, 'A', item_similarity2(train2), 5)
    print('ItemCF-IUF:', dict(rank4))
    rank5 = recommend_by_item(train2, 'A', item_similarity3(train2), 5)
    print('ItemCF-IUF+Normalization:', dict(rank5))


if __name__ == '__main__':
    main()
本博客文章皆出于学习目的,个人总结或摘抄整理自网络。引用参考部分在文章中都有原文链接,如疏忽未给出请联系本人。另外,作为一名菜鸟程序媛,如文章内容有错误,欢迎点击博客右上方的扣扣链接指导交流。