基于用户相似性的协同过滤——Python实现
代码基本来自项亮的《推荐系统实践》,把书上的伪代码具体实现,还参考了 https://www.douban.com/note/336280497/ 。
还可以加入对用户相似性的归一化操作,效果会更好。
数据集为MovieLens的10万条数据.
链接:MovieLens
#coding:utf-8 import random,math from operator import itemgetter class UserBasedCF: def __init__(self,trainDataFile=None,testDataFile=None,splitor='\t'): if trainDataFile!=None: self.train=self.loadData(trainDataFile, splitor) if testDataFile!=None: self.test=self.loadData(testDataFile, splitor) self.simiMatrix={} def setData(self,train,test): self.train=train self.test=test def loadData(self,dataFile,splitor='\t'): data={} for line in open(dataFile): user,item,record,_ = line.split() data.setdefault(user,{}) data[user][item]=record return data def recallAndPrecision(self,peersCount,topN=10): hit=0 recall=0 precision=0 for user in self.train.keys(): itemOfuser=self.test.get(user,{}) recItems=self.recommend(user,peersCount,topN) for item,pui in recItems.items(): if item in itemOfuser: hit+=1 recall+=len(itemOfuser) precision+=topN #print 'Recall:%s hit:%s allRatings:%s'%(hit/(recall*1.0),hit,precision) return (hit / (recall * 1.0),hit / (precision * 1.0)) def coverage(self,peersCount,topN=10): recommend_items=set() all_items=set() for user in self.train.keys(): for item in self.train[user].keys(): all_items.add(item) rank=self.recommend(user,peersCount,topN) for item,pui in rank.items(): recommend_items.add(item) return len(recommend_items)/(len(all_items)*1.0) def popularity(self,peersCount,topN=10): item_popularity=dict() for user,items in self.train.items(): for item in items.keys(): if item not in item_popularity: item_popularity[item]=1 item_popularity[item]+=1 ret=0 n=0 for user in self.train.keys(): rank=self.recommend(user,peersCount,topN) for item,pui in rank.items(): ret+=math.log(1+item_popularity[item]) n+=1 return ret/(n*1.0) def calUserSimilarity(self): item_users=dict() for u,ratings in self.train.items(): for i in ratings.keys(): item_users.setdefault(i,set()) item_users[i].add(u) #calculate co-rated items between users coRatedCount=dict() itemCountOfUser=dict() for item,users in item_users.items(): for u in users: itemCountOfUser.setdefault(u,0) 
itemCountOfUser[u]+=1 for v in users: if u==v: continue coRatedCount.setdefault(u,{}) coRatedCount[u].setdefault(v,0) coRatedCount[u][v]+=1/math.log(1+len(users)) userSimiMatrix=dict() for u,related_users in coRatedCount.items(): userSimiMatrix.setdefault(u,{}) for v,cuv in related_users.items(): userSimiMatrix[u][v]=cuv/math.sqrt(itemCountOfUser[u]*itemCountOfUser[v]) self.simiMatrix=userSimiMatrix def recommend(self,userU,peersCount,topN=10): recItems=dict() interacted_items=self.train[userU] '''prepare the user similarity matrix first''' if not self.simiMatrix: self.calUserSimilarity() for userV,simiUV in sorted(self.simiMatrix[userU].items(),key=itemgetter(1),reverse=True)[0:peersCount]: for item,ratingV4I in self.train[userV].items(): if item in interacted_items: continue if item not in recItems: recItems[item]=0 recItems[item]+=simiUV*float(ratingV4I)#transform 4 stars into score 0.8 '''if len(recItems)==topN: return recItems''' return dict(sorted(recItems.items(),key = lambda x :x[1],reverse = True)[0:topN]) def testUserBasedCF(): cf=UserBasedCF(trainDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.base',testDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u3.test') #cf.calUserSimilarity() print("%3s%15s%15s%15s%15s" % ('K',"precision",'recall','coverage','popularity')) for k in [5,10,20,40,80,160]: recall,precision = cf.recallAndPrecision(peersCount = k) coverage = cf.coverage(peersCount = k) popularity = cf.popularity(peersCount = k) print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity)) def SplitData(wholeData,M,k,seed,splitor='\t'): test={} train={} random.seed(seed) for line in wholeData: user,item,score,time=line.strip().split(splitor) if random.randint(0,M)==k: test.setdefault(user,{}) test[user][item]=score else: train.setdefault(user,{}) train[user][item]=score return train,test def testUserBasedCF2(): wholeData=open(r'E:\ResearchAndPapers\DataSet\ml-1m\ratings.dat') train,test=SplitData(wholeData, 
8, 5, 10, splitor='::') cf=UserBasedCF() cf.setData(train, test) #cf=UserBasedCF(trainDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u5.base',testDataFile=r'E:\ResearchAndPapers\DataSet\ml-100k\u5.test') #cf.calUserSimilarity() print("%3s%15s%15s%15s%15s" % ('K',"precision",'recall','coverage','popularity')) for k in [5,10,20,40,80,160]: recall,precision = cf.recallAndPrecision(peersCount = k) coverage = cf.coverage(peersCount = k) popularity = cf.popularity(peersCount = k) print("%3d%14.2f%%%14.2f%%%14.2f%%%15.2f" % (k,precision * 100,recall * 100,coverage * 100,popularity)) if __name__=="__main__": testUserBasedCF() #testUserBasedCF2()