协同过滤——推荐电影

协同过滤

 

1. 协同过滤的简介



关于协同过滤的一个最经典的例子就是看电影,有时候不知道哪一部电影是我们喜欢的或者评分比较高的,那
么通常的做法就是问问周围的朋友,看看最近有什么好的电影推荐。在问的时候,都习惯于问跟自己口味差不
多的朋友,这就是协同过滤的核心思想。

协同过滤是在海量数据中挖掘出小部分与你品味类似的用户,在协同过滤中,这些用户称为邻居,然后根据他
们喜欢的东西组织成一个排序的目录推荐给你。所以就有如下两个核心问题:

(1)如何确定一个用户是否与你有相似的品味?
(2)如何将邻居们的喜好组织成一个排序目录?

协同过滤算法的出现标志着推荐系统的产生,协同过滤算法包括基于用户和基于物品的协同过滤算法。

2. 协同过滤的核心


 

要实现协同过滤,需要进行如下几个步骤:
(1)收集用户偏好
(2)找到相似的用户或者物品
(3)计算并推荐

代码:

  1 # -*-coding=utf-8 -*-
  2 
  3 import sys
  4 import math
  5 from texttable import Texttable
  6 
  7 
  8 # 计算余弦距离
  9 def getCosDist(user1, user2):
 10 sum_x = 0.0
 11 sum_y = 0.0
 12 sum_xy = 0.0
 13 for key1 in user1:
 14 for key2 in user2:
 15 if key1[0] == key2[0]:
 16 sum_x += key1[1] * key1[1]
 17 sum_y += key2[1] * key2[1]
 18 sum_xy += key1[1] * key2[1]
 19 if sum_xy == 0.0:
 20 return 0
 21 demo = math.sqrt(sum_x * sum_y)
 22 return sum_xy / demo
 23 
 24 
 25 # 读取文件,读取以行为单位,每一行是列表里的一个元素
 26 def readFile(filename):
 27 contents = []
 28 f = open(filename, "r")
 29 contents = f.readlines()
 30 f.close()
 31 return contents
 32 
 33 
 34 # 数据格式化为二维数组
 35 def getRatingInfo(ratings):
 36 rates = []
 37 ratings.pop(0)
 38 for line in ratings:
 39 rate = line.split(",")
 40 rates.append([int(rate[0]), int(rate[1]), float(rate[2])])
 41 return rates
 42 
 43 
 44 # 生成用户评分数据结构
 45 def getUserScoreDataStructure(rates):
 46 # userDict[2]=[(1,5),(4,2)].... 表示用户2对电影1的评分是5,对电影4的评分是2
 47 userDict = {}
 48 itemUser = {}
 49 for k in rates:
 50 user_rank = (k[1], k[2])
 51 if k[0] in userDict:
 52 userDict[k[0]].append(user_rank)
 53 else:
 54 userDict[k[0]] = [user_rank]
 55 
 56 if k[1] in itemUser:
 57 itemUser[k[1]].append(k[0])
 58 else:
 59 itemUser[k[1]] = [k[0]]
 60 return userDict, itemUser
 61 
 62 
 63 # 计算与指定用户最相近的邻居
 64 def getNearestNeighbor(userId, userDict, itemUser):
 65 neighbors = []
 66 for item in userDict[userId]:
 67 for neighbor in itemUser[item[0]]:
 68 if neighbor != userId and neighbor not in neighbors:
 69 neighbors.append(neighbor)
 70 neighbors_dist = []
 71 for neighbor in neighbors:
 72 dist = getCosDist(userDict[userId], userDict[neighbor])
 73 neighbors_dist.append([dist, neighbor])
 74 neighbors_dist.sort(reverse=True)
 75 return neighbors_dist
 76 
 77 
 78 # 使用UserFC进行推荐,输入:文件名,用户ID,邻居数量
 79 def recommendByUserFC(filename, userId, k=5):
 80 # 读取文件
 81 contents = readFile(filename)
 82 
 83 # 文件格式数据转化为二维数组
 84 rates = getRatingInfo(contents)
 85 # 格式化成字典数据
 86 userDict, itemUser = getUserScoreDataStructure(rates)
 87 
 88 # 找邻居
 89 neighbors = getNearestNeighbor(userId, userDict, itemUser)[:5]
 90 # 建立推荐字典
 91 recommand_dict = {}
 92 for neighbor in neighbors:
 93 neighbor_user_id = neighbor[1]
 94 movies = userDict[neighbor_user_id]
 95 for movie in movies:
 96 if movie[0] not in recommand_dict:
 97 recommand_dict[movie[0]] = neighbor[0]
 98 else:
 99 recommand_dict[movie[0]] += neighbor[0]
100 
101 # 建立推荐列表
102 recommand_list = []
103 for key in recommand_dict:
104 recommand_list.append([recommand_dict[key], key])
105 recommand_list.sort(reverse=True)
106 user_movies = [k[0] for k in userDict[userId]]
107 return [k[1] for k in recommand_list], user_movies, itemUser, neighbors
108 
109 
# Load the movie list
def getMovieList(filename):
    """Read the movies CSV file into a dictionary keyed by movie id.

    The first field of each row is the movie id, the second the title and
    the last a ``|``-separated list of genres.

    Args:
        filename: path of the movies CSV file.

    Returns:
        A dict mapping ``movie_id`` (int) to a ``(title, genres)`` tuple,
        where ``genres`` is a list of strings.

    Raises:
        ValueError: if a row other than the first has a non-integer id.
        IndexError: if a non-blank row has fewer than two fields.
    """
    import csv  # local import keeps this function self-contained

    movies_info = {}
    # csv.reader understands quoted fields, so titles that contain commas
    # (e.g. "American President, The (1995)") are not split apart.
    for row_number, single_info in enumerate(csv.reader(readFile(filename))):
        if not single_info:
            continue  # blank line
        try:
            movie_id = int(single_info[0])
        except ValueError:
            if row_number == 0:
                continue  # header row such as "movieId,title,genres"
            raise
        movies_info[movie_id] = single_info[1], single_info[-1].split('|')
    return movies_info
119 
120 
# Script entry point
if __name__ == '__main__':

    # Python 2 only: switch the implicit str/unicode codec to UTF-8 so that
    # non-ASCII titles can be rendered.  Neither name exists on Python 3,
    # where text is Unicode already.
    if sys.version_info[0] < 3:
        reload(sys)  # noqa: F821 -- builtin on Python 2
        sys.setdefaultencoding('utf-8')

    # Load every movie, then compute the recommendations for user 50 using
    # up to 80 neighbors.
    movies = getMovieList("movies.csv")
    recommend_list, user_movie, items_movie, neighbors = recommendByUserFC("ratings.csv", 50, 80)

    # The table has a single text column; the column specs must have the
    # same length as the rows, otherwise Texttable rejects the rows.
    table = Texttable()
    table.set_deco(Texttable.HEADER)
    table.set_cols_dtype(['t'])
    table.set_cols_align(["l"])
    rows = [[u"movie name"]]
    # Show the 20 best-ranked recommendations.
    for movie_id in recommend_list[:20]:
        print([movies[movie_id][0]])
        rows.append([movies[movie_id][0]])
    table.add_rows(rows)
    print(table.draw())
View Code

 

posted @ 2017-03-09 20:00  zcbmxvn987  阅读(474)  评论(0)  编辑  收藏  举报