SimRank 的 Python 实现

1、数据

pc,hp.com
pc,hp.com
pc,hp.com
pc,hp.com
pc,hp.com
pc,hp.com
pc,hp.com
pc,hp.com
pc,hp.com
pc,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,hp.com
camera,bestbuy.com
camera,bestbuy.com
camera,bestbuy.com
camera,bestbuy.com
camera,bestbuy.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,hp.com
digital camera,bestbuy.com
digital camera,bestbuy.com
digital camera,bestbuy.com
digital camera,bestbuy.com
digital camera,bestbuy.com
digital camera,bestbuy.com
digital camera,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
tv,bestbuy.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,teleflora.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
flower,orchids.com
View Code

2、SimRank 的 Python 实现

import numpy as np 
from numpy import matrix

# Load the click log: one "query,ad" record per line.
with open('sample1 (1).txt','r') as log_fp:
    # Blank lines are skipped; otherwise they would become one-field records
    # and raise IndexError when the ad column (log[1]) is read below.
    logs = [log.strip() for log in log_fp if log.strip()]
logs_tuple = [tuple(log.split(",")) for log in logs]

# Distinct queries and ads label the rows and columns of the click matrix.
# They are sorted because iterating a set of strings gives a different order
# from one interpreter run to the next, which would shuffle every matrix
# printed by this script.
queries = sorted({log[0] for log in logs_tuple})
ads = sorted({log[1] for log in logs_tuple})

# Click-count matrix: len(queries) rows by len(ads) columns, initially zero.
graph = np.matrix(np.zeros([len(queries),len(ads)]))

# Label -> position lookups so filling the matrix does not rescan the lists.
query_index = {query: i for i, query in enumerate(queries)}
ad_index = {ad: j for j, ad in enumerate(ads)}

# graph[i, j] counts how many log records pair queries[i] with ads[j].
for log in logs_tuple:
    graph[query_index[log[0]], ad_index[log[1]]] += 1
print(graph)

# Both similarity matrices start as the identity: each query/ad is fully
# similar to itself and not similar to anything else.
query_sim = matrix(np.identity(len(queries)))
print(query_sim)
ad_sim = matrix(np.identity(len(ads)))
print(ad_sim)

def get_ads_num(query):
    """Return the row of the click matrix `graph` that belongs to `query`.

    The row holds one click count per entry of `ads`. Raises ValueError
    (from list.index) when `query` is not in `queries`.
    """
    row = queries.index(query)
    return graph[row, :]

def get_queries_num(ad):
    """Return the click counts of `ad`, one per entry of `queries`.

    This is the column of `graph` for `ad`, returned as a row of the
    transposed matrix. Raises ValueError (from list.index) when `ad` is not
    in `ads`.
    """
    column = ads.index(ad)
    return graph.T[column]

def get_ads(query):
    """Return the ads that have a positive click count for `query`."""
    # .tolist() on the 1 x N matrix row yields a nested list; take its only row.
    counts = get_ads_num(query).tolist()[0]
    return [ads[pos] for pos, count in enumerate(counts) if count > 0]

def get_queries(ad):
    """Return the queries that have a positive click count for `ad`."""
    # .tolist() on the 1 x N matrix row yields a nested list; take its only row.
    counts = get_queries_num(ad).tolist()[0]
    return [queries[pos] for pos, count in enumerate(counts) if count > 0]


def query_simrank(q1,q2,c):
    """Compute one similarity update for the query pair (q1, q2).

    Returns 1 when both queries are equal. Otherwise returns
    c / (clicks(q1) * clicks(q2)) multiplied by the sum of the current
    `ad_sim` entries over every pair (ad of q1, ad of q2), where clicks(q)
    is the total click count in q's row of `graph`.

    NOTE(review): the normaliser is the total click count, while the sum
    runs over distinct ads only. Textbook SimRank divides by the number of
    neighbours instead - confirm that this weighting is intended.
    """
    if q1 == q2:
        return 1

    clicks_q1 = get_ads_num(q1).sum()
    clicks_q2 = get_ads_num(q2).sum()
    scale = c / (clicks_q1 * clicks_q2)

    # Accumulate the current ad-to-ad similarity over all neighbour pairs.
    pair_total = sum(
        ad_sim[ads.index(left), ads.index(right)]
        for left in get_ads(q1)
        for right in get_ads(q2)
    )
    return scale * pair_total


def ad_simrank(a1,a2,c):
    """Compute one similarity update for the ad pair (a1, a2).

    Returns 1 when both ads are equal. Otherwise returns
    c / (clicks(a1) * clicks(a2)) multiplied by the sum of the current
    `query_sim` entries over every pair (query of a1, query of a2), where
    clicks(a) is the total click count in a's column of `graph`.

    NOTE(review): the normaliser is the total click count, while the sum
    runs over distinct queries only. Textbook SimRank divides by the number
    of neighbours instead - confirm that this weighting is intended.
    """
    if a1 == a2:
        return 1

    clicks_a1 = get_queries_num(a1).sum()
    clicks_a2 = get_queries_num(a2).sum()
    scale = c / (clicks_a1 * clicks_a2)

    # Accumulate the current query-to-query similarity over all neighbour pairs.
    pair_total = sum(
        query_sim[queries.index(left), queries.index(right)]
        for left in get_queries(a1)
        for right in get_queries(a2)
    )
    return scale * pair_total


def simrank(c=0.8, times=1):
    """Run `times` rounds of the similarity update on the module globals.

    Each round builds a fresh query matrix with query_simrank and a fresh ad
    matrix with ad_simrank, passing `c` through to both. The globals
    `query_sim` and `ad_sim` are rebound only after both new matrices are
    complete, so every value in a round is computed from the previous
    round's matrices. Returns None.
    """
    global query_sim, ad_sim

    for _ in range(times):
        next_query_sim = matrix(np.identity(len(queries)))
        for row, q_row in enumerate(queries):
            for col, q_col in enumerate(queries):
                next_query_sim[row, col] = query_simrank(q_row, q_col, c)

        next_ad_sim = matrix(np.identity(len(ads)))
        for row, a_row in enumerate(ads):
            for col, a_col in enumerate(ads):
                next_ad_sim[row, col] = ad_simrank(a_row, a_col, c)

        # Publish both results together once the round is finished.
        query_sim, ad_sim = next_query_sim, next_ad_sim


if __name__ == '__main__':
    # Print the row/column labels first so the matrices below can be read.
    for labels in (queries, ads):
        print(labels)

    # One update round with the default arguments.
    simrank()

    # The tuple is built after simrank() has rebound the globals, so these
    # are the updated similarity matrices.
    for similarity in (query_sim, ad_sim):
        print(similarity)

[[15.  0.  0.  0.]
 [ 0.  0. 10.  0.]
 [ 5.  0. 20.  0.]
 [ 7.  0. 30.  0.]
 [ 0. 16.  0. 15.]]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
['tv', 'pc', 'camera', 'digital camera', 'flower']
['bestbuy.com', 'teleflora.com', 'hp.com', 'orchids.com']
[[1.         0.         0.00213333 0.00144144 0.        ]
 [0.         1.         0.0032     0.00216216 0.        ]
 [0.00213333 0.0032     1.         0.00172973 0.        ]
 [0.00144144 0.00216216 0.00172973 1.         0.        ]
 [0.         0.         0.         0.         1.        ]]
[[1.00000000e+00 0.00000000e+00 9.87654321e-04 0.00000000e+00]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.33333333e-03]
 [9.87654321e-04 0.00000000e+00 1.00000000e+00 0.00000000e+00]
 [0.00000000e+00 3.33333333e-03 0.00000000e+00 1.00000000e+00]]

 

 

posted @ 2019-11-08 17:22  hehe哒  阅读(1520)  评论(0)  编辑  收藏  举报