协同过滤——推荐电影
协同过滤
1. 协同过滤的简介
关于协同过滤的一个最经典的例子就是看电影,有时候不知道哪一部电影是我们喜欢的或者评分比较高的,那
么通常的做法就是问问周围的朋友,看看最近有什么好的电影推荐。在问的时候,都习惯于问跟自己口味差不
多的朋友,这就是协同过滤的核心思想。
协同过滤是在海量数据中挖掘出小部分与你品味类似的用户,在协同过滤中,这些用户称为邻居,然后根据他
们喜欢的东西组织成一个排序的目录推荐给你。所以就有如下两个核心问题:
(1)如何确定一个用户是否与你有相似的品味?
(2)如何将邻居们的喜好组织成一个排序目录?
协同过滤算法的出现标志着推荐系统的产生,协同过滤算法包括基于用户和基于物品的协同过滤算法。
2. 协同过滤的核心
要实现协同过滤,需要进行如下几个步骤
(1)收集用户偏好
(2)找到相似的用户或者物品
(3)计算并推荐
代码:
# -*- coding: utf-8 -*-
"""User-based collaborative filtering over MovieLens-style CSV files.

The module reads a ratings file (``userId,movieId,rating,...`` with a header
row) and a movies file (``movieId,title,genres``), finds the users most
similar to a target user and ranks the movies those neighbours rated.
"""

import csv
import math
import sys


def getCosDist(user1, user2):
    """Return the cosine similarity of two users over their co-rated items.

    Only items present in both rating lists contribute to the dot product
    and to both norms.

    Args:
        user1: list of ``(item_id, rating)`` pairs for the first user.
        user2: list of ``(item_id, rating)`` pairs for the second user.
            Each item is expected to appear at most once per user.

    Returns:
        ``sum(r1 * r2) / sqrt(sum(r1^2) * sum(r2^2))`` over the shared items,
        or ``0.0`` when the dot product is zero (including the case of no
        shared items).
    """
    # Index the second user's ratings once so each lookup is O(1) instead of
    # re-scanning the whole list for every item of the first user.
    other_ratings = {entry[0]: entry[1] for entry in user2}

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        if entry[0] not in other_ratings:
            continue
        mine = entry[1]
        theirs = other_ratings[entry[0]]
        sum_x += mine * mine
        sum_y += theirs * theirs
        sum_xy += mine * theirs

    # A zero dot product also covers the "no shared items" case and protects
    # the division below from a zero denominator.
    if sum_xy == 0.0:
        return 0.0
    return sum_xy / math.sqrt(sum_x * sum_y)


def readFile(filename):
    """Read a text file and return its lines (line endings kept) as a list.

    Args:
        filename: path of the file to read.

    Returns:
        A list with one string per line of the file.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, "r") as handle:
        return handle.readlines()


def getRatingInfo(ratings):
    """Parse raw rating lines into ``[user_id, movie_id, rating]`` rows.

    The first line is treated as a header and skipped; blank lines are
    ignored. The input list is not modified.

    Args:
        ratings: list of comma-separated lines, header first.

    Returns:
        A list of ``[int, int, float]`` rows; empty if there is no data.

    Raises:
        ValueError: if a data line has non-numeric fields.
        IndexError: if a data line has fewer than three fields.
    """
    rates = []
    # Slicing (rather than pop(0)) leaves the caller's list intact and is
    # safe on an empty list.
    for line in ratings[1:]:
        if not line.strip():
            continue
        fields = line.split(",")
        rates.append([int(fields[0]), int(fields[1]), float(fields[2])])
    return rates


def getUserScoreDataStructure(rates):
    """Build the user->ratings and item->users lookup tables.

    Args:
        rates: list of ``[user_id, movie_id, rating]`` rows.

    Returns:
        A tuple ``(userDict, itemUser)`` where ``userDict[user_id]`` is a list
        of ``(movie_id, rating)`` tuples (e.g. ``userDict[2] == [(1, 5), (4, 2)]``
        means user 2 rated movie 1 with 5 and movie 4 with 2) and
        ``itemUser[movie_id]`` is the list of user ids who rated that movie.
    """
    userDict = {}
    itemUser = {}
    for user_id, movie_id, rating in rates:
        userDict.setdefault(user_id, []).append((movie_id, rating))
        itemUser.setdefault(movie_id, []).append(user_id)
    return userDict, itemUser


def getNearestNeighbor(userId, userDict, itemUser):
    """Rank every user who shares at least one rated item with ``userId``.

    Args:
        userId: id of the target user.
        userDict: mapping ``user_id -> [(movie_id, rating), ...]``.
        itemUser: mapping ``movie_id -> [user_id, ...]``.

    Returns:
        A list of ``[similarity, neighbor_id]`` pairs sorted in descending
        order (by similarity, then by neighbour id).

    Raises:
        KeyError: if ``userId`` has no entry in ``userDict``.
    """
    own_ratings = userDict[userId]

    # A set gives O(1) de-duplication; the final sort makes the result
    # independent of the set's iteration order.
    candidates = set()
    for entry in own_ratings:
        candidates.update(itemUser[entry[0]])
    candidates.discard(userId)

    neighbors_dist = [
        [getCosDist(own_ratings, userDict[neighbor]), neighbor]
        for neighbor in candidates
    ]
    neighbors_dist.sort(reverse=True)
    return neighbors_dist


def recommendByUserFC(filename, userId, k=5):
    """Recommend movies to a user with user-based collaborative filtering.

    Each movie rated by one of the ``k`` nearest neighbours is scored with
    the sum of the similarities of the neighbours who rated it.

    Args:
        filename: path of the ratings CSV file (with a header row).
        userId: id of the user to recommend for.
        k: number of nearest neighbours to use.

    Returns:
        A tuple ``(recommended_ids, user_movies, itemUser, neighbors)``:
        movie ids ordered by descending score, the movie ids the user already
        rated, the ``movie_id -> [user_id, ...]`` table, and the selected
        ``[similarity, neighbor_id]`` pairs.
        NOTE: ``recommended_ids`` is not filtered against ``user_movies``;
        callers that want unseen movies only must filter it themselves.

    Raises:
        KeyError: if ``userId`` does not occur in the ratings file.
    """
    contents = readFile(filename)
    rates = getRatingInfo(contents)
    userDict, itemUser = getUserScoreDataStructure(rates)

    # Honour the requested neighbourhood size (previously hard-coded to 5).
    neighbors = getNearestNeighbor(userId, userDict, itemUser)[:k]

    # Accumulate, per movie, the similarity of every neighbour who rated it.
    recommand_dict = {}
    for similarity, neighbor_user_id in neighbors:
        for movie in userDict[neighbor_user_id]:
            recommand_dict[movie[0]] = recommand_dict.get(movie[0], 0.0) + similarity

    recommand_list = sorted(
        ([score, movie_id] for movie_id, score in recommand_dict.items()),
        reverse=True,
    )
    user_movies = [entry[0] for entry in userDict[userId]]
    return [pair[1] for pair in recommand_list], user_movies, itemUser, neighbors


def getMovieList(filename):
    """Load the movie catalogue from a CSV file.

    Rows are parsed with the ``csv`` module so quoted titles that contain
    commas stay intact. A non-numeric first row is treated as a header and
    blank lines are skipped.

    Args:
        filename: path of the movies CSV file (``movieId,title,genres``).

    Returns:
        A dict mapping ``movie_id`` to a ``(title, genres)`` tuple, where
        ``genres`` is the last column split on ``'|'``.

    Raises:
        ValueError: if a row other than the first has a non-integer id.
    """
    contents = readFile(filename)
    movies_info = {}
    for index, row in enumerate(csv.reader(contents)):
        if not row:
            continue
        try:
            movie_id = int(row[0])
        except ValueError:
            if index == 0:
                # Header row such as "movieId,title,genres".
                continue
            raise
        movies_info[movie_id] = row[1], row[-1].split('|')
    return movies_info


def main():
    """Print the top 20 recommendations for user 50 as a one-column table."""
    # Imported here so the recommendation functions above remain importable
    # on systems where the third-party texttable package is not installed.
    from texttable import Texttable

    if sys.version_info[0] < 3:
        # Python 2 only: make implicit str/unicode conversions use UTF-8.
        reload(sys)  # noqa: F821 - builtin on Python 2
        sys.setdefaultencoding('utf-8')

    movies = getMovieList("movies.csv")
    recommend_list, _, _, _ = recommendByUserFC("ratings.csv", 50, 80)

    table = Texttable()
    table.set_deco(Texttable.HEADER)
    # The table has exactly one column; the dtype/align lists must match the
    # row width or texttable rejects the rows.
    table.set_cols_dtype(['t'])
    table.set_cols_align(["l"])

    rows = [[u"movie name"]]
    for movie_id in recommend_list[:20]:
        title = movies[movie_id][0]
        print([title])
        rows.append([title])
    table.add_rows(rows)
    print(table.draw())


# Script entry point.
if __name__ == '__main__':
    main()