CF(Collaborative Filtering,协同过滤)算法
generate_train_data.py
"""Build a {user: {item: rating}} dict from a ratings file and print CF results.

The script loads the tab-separated ratings file at ``data_path``, converts it
into the nested-dict format expected by ``user_cf`` and ``item_cf``, and prints
the 20 highest-scoring recommendations for user '1' from each model.
"""
import operator

import pandas as pd

import item_cf
import user_cf

data_path = 'G:\\Bigdata_object\\u.data'

# The file has no header row; columns are supplied explicitly.
udata = pd.read_csv(data_path, sep='\t', header=None,
                    names=['user_id', 'item_id', 'rating', 'timestamp'])

# train format: {user_id (str): {item_id (str): rating}}
train = dict()
# itertuples() avoids building a Series per row, which iterrows() does.
for row in udata.itertuples(index=False):
    train.setdefault(str(row.user_id), dict())[str(row.item_id)] = row.rating

# ------------------------- user_cf test -------------------------
W = user_cf.user_similarity(train)
# This assignment must be live: the print below reads rec_item_list.
rec_item_list = user_cf.recommend('1', train, W, 10)
print(sorted(rec_item_list.items(), key=operator.itemgetter(1), reverse=True)[:20])

# ------------------------- item_cf test -------------------------
W2 = item_cf.item_similarity(train)
item_list = item_cf.recommend(train, '1', W2, 10)
print(sorted(item_list.items(), key=operator.itemgetter(1), reverse=True)[:20])
item_cf.py
"""Item-based collaborative filtering over a {user: {item: rating}} dict."""
import math
import operator


def item_similarity(train):
    """Return the item-to-item similarity matrix.

    The similarity of items i and j is the number of users who have both,
    divided by sqrt(users_with_i * users_with_j).

    Args:
        train: mapping ``{user: {item: rating}}``; only the item keys are
            used here, ratings are ignored.

    Returns:
        ``{item: {other_item: similarity}}``. Every item seen in ``train``
        gets a row; the row is empty when the item never co-occurs with
        another item.
    """
    co_counts = {}    # co_counts[i][j]: users that have both i and j (numerator)
    user_counts = {}  # user_counts[i]: users that have i (denominator terms)
    for items in train.values():
        for i in items:
            user_counts[i] = user_counts.get(i, 0) + 1
            # Create the row even for an item with no co-occurrences so that
            # the item still appears as a key in the result.
            row = co_counts.setdefault(i, {})
            for j in items:
                if i != j:
                    row[j] = row.get(j, 0) + 1

    # Normalise the raw co-occurrence counts.
    return {
        i: {
            j: cij / math.sqrt(user_counts[i] * user_counts[j])
            for j, cij in related_items.items()
        }
        for i, related_items in co_counts.items()
    }


def recommend(train, user, w, k):
    """Score items the user has not rated yet.

    For each item the user rated, the ``k`` most similar items are taken from
    ``w`` and each receives ``rating * similarity``; contributions to the same
    candidate are summed.

    Args:
        train: mapping ``{user: {item: rating}}``.
        user: key of the target user in ``train``.
        w: item similarity matrix ``{item: {other_item: similarity}}``.
        k: number of nearest items considered per rated item.

    Returns:
        ``{item: score}`` for candidate items; items already rated by the
        user are excluded.

    Raises:
        KeyError: if ``user`` is not present in ``train``.
    """
    rank = {}
    ru = train[user]
    for i, pi in ru.items():
        # A rated item that is missing from the matrix contributes nothing.
        neighbours = sorted(w.get(i, {}).items(),
                            key=operator.itemgetter(1), reverse=True)[0:k]
        for j, wj in neighbours:
            if j in ru:
                continue
            # Default to 0 via get(): using -1 as a "missing" marker would
            # wipe a score that legitimately sums to -1.
            rank[j] = rank.get(j, 0) + pi * wj
    return rank
user_cf.py
"""User-based collaborative filtering over a {user: {item: rating}} dict."""
import operator
import math


# train format: {user: {item: rating}}
def user_similarity(train):
    """Return the user-to-user similarity matrix.

    The similarity of users u and v is the number of items they share,
    divided by sqrt(items_of_u * items_of_v).

    Args:
        train: mapping ``{user: {item: rating}}``; only the item keys are
            used here, ratings are ignored.

    Returns:
        ``{user: {other_user: similarity}}``. Every user with at least one
        item gets a row; the row is empty when the user shares no item with
        anyone else.
    """
    # Build the item -> users inverted index so that only users who share
    # at least one item are ever paired.
    item_users = {}
    for u, items in train.items():
        for i in items:
            item_users.setdefault(i, set()).add(u)

    common = {}   # common[u][v]: number of items shared by u and v (numerator)
    n_items = {}  # n_items[u]: number of items u has (denominator terms)
    for users in item_users.values():
        for u in users:
            n_items[u] = n_items.get(u, 0) + 1
            # Create the row even when u is the item's only user so that u
            # still appears as a key in the result.
            row = common.setdefault(u, {})
            for v in users:
                if u != v:
                    # Variant left disabled in the original code: add
                    # 1 / math.log(1 + len(users)) instead of 1.
                    row[v] = row.get(v, 0) + 1

    # Normalise the raw overlap counts into the final similarity matrix.
    return {
        u: {
            v: cuv / math.sqrt(n_items[u] * n_items[v] * 1.0)
            for v, cuv in related_users.items()
        }
        for u, related_users in common.items()
    }


def recommend(user, train, w, k):
    """Score items taken from the ``k`` users most similar to ``user``.

    Each item of a neighbour receives ``similarity * neighbour_rating``;
    contributions to the same item from several neighbours are summed.

    Args:
        user: key of the target user in ``train``.
        train: mapping ``{user: {item: rating}}``.
        w: user similarity matrix ``{user: {other_user: similarity}}``.
        k: number of nearest users to draw items from.

    Returns:
        ``{item: score}`` for candidate items; items the user already
        interacted with are excluded.

    Raises:
        KeyError: if ``user``, or a neighbour listed in ``w``, is not
            present in ``train``.
    """
    rank = {}
    interacted_items = train[user]
    # A user with no row in the matrix simply gets no recommendations.
    neighbours = sorted(w.get(user, {}).items(),
                        key=operator.itemgetter(1), reverse=True)[0:k]
    for v, wuv in neighbours:
        for i, rvi in train[v].items():
            if i in interacted_items:  # skip items the user already rated
                continue
            # Default to 0 via get(): using -1 as a "missing" marker would
            # wipe a score that legitimately sums to -1.
            rank[i] = rank.get(i, 0) + wuv * rvi
    return rank
########## 今天的苦逼是为了不这样一直苦逼下去!##########