用户标签

代码:用户标签

通过标签将用户和物品联系起来

标签的作用:
1.物品相关:物品的属性(时间,创作者等)
2.用户相关:用户对物品的看法、任务(待读等)
 
算法:
1.简单算法(推荐用户常用标签下的热门物品)
  计算用户对物品的喜好
  $p(u,i)=\sum_b \frac{n_{u,b}}{\log(1+n_b^{(u)})}\frac{n_{i,b}}{\log(1+n_i^{(u)})}$
  
# coding=gbk
import pandas as pd
import math

# Read the tab-separated, header-less tagging log into a DataFrame.
data = pd.read_csv(
    'delicious.dat',
    sep='\t',
    header=None,
)

class SimpleTagBased:
    """Simple tag-based recommender.

    Recommends the popular items found under the tags a user applies most.
    The preference of a user for an unseen item is the sum, over the user's
    tags, of

        user_tag_count / log(1 + tag_count) * tag_item_count / log(1 + item_count)

    All count tables are created per instance, so building a second model
    never mixes its counts with those of an earlier one.
    """

    def __init__(self, data):
        """Build the count tables from the tagging log.

        Args:
            data: object whose ``itertuples(index=False)`` yields
                ``(user, item, tags)`` rows, where ``tags`` is a
                whitespace-separated string of tag names. Rows whose
                ``tags`` is a float (a missing field read as NaN) or holds
                no tag at all are skipped.
        """
        # {user: {tag: number of times the user applied the tag}}
        self.user_tag = dict()
        # {user: [items the user tagged]}; these are filtered out when
        # recommending.
        self.user_item = dict()
        # {tag: {item: number of times the tag was applied to the item}}
        self.tag_item = dict()
        # {tag: total number of times the tag was applied}
        self.tag_count = dict()
        # {item: number of rows (tagging events) that mention the item}
        self.item_count = dict()
        # {item: {tag: number of times the item received the tag}}
        self.item_tag = dict()

        for user, item, tags in data.itertuples(index=False):
            if isinstance(tags, float):
                continue
            # split() without an argument drops the empty strings that
            # split(' ') produces for repeated or trailing spaces, which
            # would otherwise be counted as a bogus '' tag.
            tag_list = tags.lower().split()
            if not tag_list:
                continue

            self.user_item.setdefault(user, list()).append(item)
            self.item_count[item] = self.item_count.get(item, 0) + 1

            per_item = self.item_tag.setdefault(item, dict())
            per_user = self.user_tag.setdefault(user, dict())

            for tag in tag_list:
                per_user[tag] = per_user.get(tag, 0) + 1

                per_tag = self.tag_item.setdefault(tag, dict())
                per_tag[item] = per_tag.get(item, 0) + 1

                self.tag_count[tag] = self.tag_count.get(tag, 0) + 1

                per_item[tag] = per_item.get(tag, 0) + 1

    @staticmethod
    def _top(counts, n):
        """Return the ``n`` keys of ``counts`` with the largest values, best first."""
        # key= works on both Python 2 and 3; the positional cmp function
        # used previously is rejected by Python 3.
        ordered = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        return [key for key, _ in ordered[:n]]

    def recommend(self, user, n_items=5, n_tags=10):
        """Recommend items for ``user`` and describe them by their tags.

        Args:
            user: user id present in the data the model was built from.
            n_items: number of items to recommend (default 5).
            n_tags: number of tags used to describe the user and each
                recommended item (default 10).

        Returns:
            ``(userdesc, res)`` where ``userdesc`` is the list of the user's
            most used tags and ``res`` holds, for each recommended item in
            rank order, the list of the tags most often applied to it.

        Raises:
            KeyError: if ``user`` has no tagged row in the data.
        """
        # A set makes the "already tagged by the user" test O(1).
        viewed = set(self.user_item[user])
        utags = self.user_tag[user]

        rank = dict()
        for tag, weight in utags.items():
            # Contribution of the tag that does not depend on the item.
            tag_score = weight * 1.0 / math.log(1 + self.tag_count[tag])
            for item, wt in self.tag_item[tag].items():
                if item in viewed:
                    continue
                # Preference of the user for the item accumulated over tags.
                rank[item] = (rank.get(item, 0)
                              + tag_score * wt / math.log(1 + self.item_count[item]))

        # The most frequent tags of each recommended item describe the item.
        res = [self._top(self.item_tag[item], n_tags)
               for item in self._top(rank, n_items)]
        # The user's most used tags describe the user's interests.
        userdesc = self._top(utags, n_tags)
        return (userdesc, res)
# Build the model from the loaded log and show the recommendation for
# user 104: first the user's tags, then the tags of each recommended item.
# print(x) with a single argument prints the same on Python 2 and 3.
stb = SimpleTagBased(data=data)
userdesc, res = stb.recommend(104)
print(userdesc)
print(res)

结果:

--用户常用标签

['software', 'webdesign', 'tools', 'dev', 'howto', 'free', 'freeware', 'opensource', 'reference', 'linux']

--所推荐物品具有的标签
['css', 'webdesign', 'reference', 'design', 'web', 'development', 'html', 'tools', 'webdev', 'programming'],
['webdesign', 'templates', 'design', 'css', 'opensource', 'web', 'free', 'html', 'layout', 'template'],
['fonts', 'typography', 'webdesign', 'design', 'tools', 'css', 'web', 'font', 'type', 'reference'],
['opensource', 'software', 'freeware', 'linux', 'free', 'windows', 'tools', 'reference', 'download', 'alternative'],
['freeware', 'software', 'utilities', 'tools', 'free', 'reference', 'list', 'windows', 'download', 'opensource']

  改进:
  A.打过标签少的用户(生成相似标签)
    标签相似度:如果两个标签经常同时出现在同一物品的标签集合中(这样的物品越多),则认为这两个标签的相似度越高
    使用余弦相似度度量:每个标签b表示为向量$(n_{b,1},n_{b,2},n_{b,3},...,n_{b,n})$,其中$n_{b,i}$为物品i被打上标签b的次数
# coding=gbk
import pandas as pd
import math

# Read the tab-separated, header-less tagging log; each row is consumed
# below as (user, item, tags).
data = pd.read_csv('delicious.dat', sep='\t', header=None)

# {item: {tag: number of times the item received the tag}}
item_tag = dict()
for user, item, tags in data.itertuples(index=False):
    # A missing tags field is read as NaN (a float); skip such rows.
    if isinstance(tags, float):
        continue

    # split() without an argument drops the empty strings that split(' ')
    # yields for repeated or trailing spaces, which would otherwise be
    # counted as a bogus '' tag.
    tag_list = tags.lower().split()
    if not tag_list:
        continue

    counts = item_tag.setdefault(item, dict())
    for tag in tag_list:
        counts[tag] = counts.get(tag, 0) + 1

def recommend(taga, n, item_tag, verbose=True):
    """Return the ``n`` tags most similar to ``taga`` by cosine similarity.

    Every tag is treated as a vector indexed by item, whose components are
    the number of times the item received that tag. The similarity of a tag
    to ``taga`` is the dot product of the two vectors divided by the product
    of their norms.

    Args:
        taga: the tag to find neighbours for.
        n: maximum number of tags to return.
        item_tag: ``{item: {tag: count}}`` mapping.
        verbose: when true (the default, matching the historical behaviour)
            print the fraction of items processed after each item.

    Returns:
        List of tag names, most similar first. Only tags that share at
        least one item with ``taga`` are candidates, so the list is empty
        when ``taga`` never occurs.
    """
    sq_norm = dict()    # tag -> sum of squared counts over all items
    dot = dict()        # tag -> dot product with taga's vector
    taga_sq_norm = 0

    total = len(item_tag)
    for done, tags in enumerate(item_tag.values(), 1):
        if verbose:
            # Progress indicator; print(x) behaves the same on Python 2 and 3.
            print(done * 1.0 / total)

        av = tags.get(taga)
        if av is not None:
            taga_sq_norm += av * av

        for tag, v in tags.items():
            if tag == taga:
                continue
            sq_norm[tag] = sq_norm.get(tag, 0) + v * v
            # Only items carrying taga contribute to the dot product.
            if av is not None:
                dot[tag] = dot.get(tag, 0) + av * v

    # dot is empty when taga never occurs, so the zero norm is never
    # used as a divisor.
    taga_norm = math.sqrt(taga_sq_norm)
    rank = dict()
    for tag, v in dot.items():
        rank[tag] = v / taga_norm / math.sqrt(sq_norm[tag])

    # key= works on both Python 2 and 3; a positional cmp function does not.
    ordered = sorted(rank.items(), key=lambda kv: kv[1], reverse=True)
    return [tag for tag, _ in ordered[0:n]]

# Show the 10 tags most similar to 'webdesign'.
# print(x) with a single argument prints the same on Python 2 and 3.
tags = recommend('webdesign', 10, item_tag)
print(tags)

结果:和webdesign相关的标签

['design', 'css', 'web', 'webdev', 'html', 'web_design', 'inspiration', 'xhtml', 'webdevelopment', 'resources']

2.基于图的算法
  
 解法参看:概率图模型
给用户推荐标签:
  对于物品i给用户u推荐标签,标签$b_k$的推荐度为:
  $(1-\alpha)\frac{n_{u,b_k}}{\max_j n_{u,b_j}}+\alpha\frac{n_{i,b_k}}{\max_j n_{i,b_j}}$
# coding=gbk
import pandas as pd
import math

# Read the tab-separated, header-less tagging log; each row is consumed
# below as (user, item, tags).
data = pd.read_csv('delicious.dat', sep='\t', header=None)

# {item: {tag: number of times the item received the tag}}
item_tag = dict()
# {user: {tag: number of times the user applied the tag}}
user_tag = dict()
for user, item, tags in data.itertuples(index=False):
    # A missing tags field is read as NaN (a float); skip such rows.
    if isinstance(tags, float):
        continue

    # split() without an argument drops the empty strings that split(' ')
    # yields for repeated or trailing spaces, which would otherwise be
    # counted as a bogus '' tag.
    tag_list = tags.lower().split()
    if not tag_list:
        continue

    per_item = item_tag.setdefault(item, dict())
    per_user = user_tag.setdefault(user, dict())
    for tag in tag_list:
        per_item[tag] = per_item.get(tag, 0) + 1
        per_user[tag] = per_user.get(tag, 0) + 1

def recommend(user, item, user_tag, item_tag, alpha, n=10):
    """Recommend tags for ``user`` to apply to ``item``.

    The score of a tag is
    ``(1 - alpha) * user_count / max_user_count
    + alpha * item_count / max_item_count``,
    i.e. a blend of how often the user uses the tag and how often the item
    receives it, each normalised by the largest count on its side.

    Args:
        user: key into ``user_tag``.
        item: key into ``item_tag``.
        user_tag: ``{user: {tag: count}}`` mapping.
        item_tag: ``{item: {tag: count}}`` mapping.
        alpha: weight of the item side; ``1 - alpha`` weighs the user side.
        n: maximum length of each returned list (default 10).

    Returns:
        ``(udesc, idesc, res)``: the user's most used tags, the item's most
        frequent tags, and the recommended tags, each ordered best first.

    Raises:
        KeyError: if ``user`` or ``item`` is missing from its mapping.
    """
    utgs = user_tag[user]
    itgs = item_tag[item]

    def top(counts):
        # key= works on both Python 2 and 3; a positional cmp function
        # does not.
        ordered = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        return [tag for tag, _ in ordered[0:n]]

    udesc = top(utgs)
    idesc = top(itgs)

    rank = dict()
    # Each side is skipped when it has no tags, so max() is never called on
    # an empty sequence. float() keeps the normalisation a true division on
    # Python 2 even when alpha is an integer such as 0 or 1.
    if utgs:
        maxu = float(max(utgs.values()))
        for tag, v in utgs.items():
            rank[tag] = rank.get(tag, 0) + (1 - alpha) * v / maxu

    if itgs:
        maxi = float(max(itgs.values()))
        for tag, v in itgs.items():
            rank[tag] = rank.get(tag, 0) + alpha * v / maxi

    res = top(rank)
    return udesc, idesc, res

# Recommend tags for user 104 on item 33911 with alpha = 0.8, then show
# the user's tags, the item's tags and the recommended tags.
# print(x) with a single argument prints the same on Python 2 and 3.
udesc, idesc, res = recommend(104, 33911, user_tag, item_tag, 0.8)
print(udesc)
print(idesc)
print(res)

结果:

--用户常用标签

['software', 'webdesign', 'tools', 'dev', 'howto', 'free', 'freeware', 'opensource', 'reference', 'linux']

--物品常被打标签
['web', 'softwareagents', 'java', 'howto', 'moviles', 'documentation', 'semantica', 'hpi', 'api', 'agents']

--推荐标签
['howto', 'web', 'moviles', 'softwareagents', 'hpi', 'agents', 'api', 'jade', 'agentes', 'java']

 

posted @ 2015-04-24 08:58  porco  阅读(801)  评论(0编辑  收藏  举报