信息检索————NDCG计算
先贴代码,原理有时间补上。
import numpy as np
import copy
from math import log

# Input data.
# `ideal` holds the graded relevance scores in the best possible order
# (sorted descending); it is the reference ranking used for IDCG.
ideal = np.array([5, 5, 4, 4, 3, 3, 2, 2, 1, 1])
# relevant_score_list[d - 1] is the relevance score of the document whose
# 1-based id is d.
relevant_score_list = np.array([5.0, 5.0, 4.0, 4.0, 3.0, 3.0, 2.0, 2.0, 1.0, 1.0])
# Document ids (1-based) in the order they were returned. Ids larger than
# the score table (11, 12, 13 here) have no score and count as relevance 0.
recall_list = np.array([8, 11, 2, 12, 3, 1, 5, 4, 13, 7])


def dcg(rscore, m):
    """Return the discounted gain of a single result.

    Args:
        rscore: graded relevance score of the result.
        m: 0-based rank position of the result.

    Returns:
        (2 ** rscore - 1) / log2(m + 2).
    """
    return (2.0 ** rscore - 1.0) / np.log2(2.0 + m)


def _relevance_of(scores, doc_id):
    """Return scores[doc_id - 1] for a 1-based id, or 0.0 if the id is out of range."""
    # The lower bound matters: without it an id of 0 or below would silently
    # wrap around through negative indexing.
    if 1 <= doc_id <= len(scores):
        return scores[doc_id - 1]
    return 0.0


def dcg_k(rslist, rlist, k):
    """Return the cumulative DCG at every cutoff 0..k.

    Args:
        rslist: relevance scores indexed by (document id - 1).
        rlist: 1-based document ids in returned order.
        k: deepest cutoff to evaluate.

    Returns:
        A list of k + 1 values where entry i is DCG@i (entry 0 is 0.0).
        Ranks past the end of `rlist` and ids without a score contribute 0.
    """
    cumulative = [0.0]
    for rank in range(k):
        score = _relevance_of(rslist, rlist[rank]) if rank < len(rlist) else 0.0
        cumulative.append(cumulative[-1] + dcg(score, rank))
    return cumulative


def idcg_k(ideal, k):
    """Return the cumulative ideal DCG at every cutoff 0..k.

    IDCG is the DCG of the best possible ordering, i.e. the ordering given
    by `ideal`.

    Args:
        ideal: relevance scores in ideal (descending) order.
        k: deepest cutoff to evaluate.

    Returns:
        A list of k + 1 values where entry i is IDCG@i (entry 0 is 0.0).
        Ranks past the end of `ideal` contribute 0.
    """
    cumulative = [0.0]
    for rank in range(k):
        score = ideal[rank] if rank < len(ideal) else 0.0
        cumulative.append(cumulative[-1] + dcg(score, rank))
    return cumulative


def ndcg_at_each_rank(rslist, rlist, k, ideal_scores=None):
    """Return NDCG@1 .. NDCG@k, each rounded to three decimals.

    This is the cumulative-list implementation. It has its own name so that
    it is no longer shadowed by the scalar `ndcg` defined below.

    Args:
        rslist: relevance scores indexed by (document id - 1).
        rlist: 1-based document ids in returned order.
        k: deepest cutoff to evaluate.
        ideal_scores: relevance scores in ideal order; defaults to the
            module-level `ideal`.

    Returns:
        A list of k floats; entry i is NDCG@(i + 1). A cutoff whose IDCG is
        0 yields 0.0.
    """
    reference = ideal if ideal_scores is None else ideal_scores
    dcg_values = dcg_k(rslist, rlist, k)
    # Normalisation factor: the DCG of the perfectly ordered result list.
    idcg_values = idcg_k(reference, k)

    result = []
    for achieved, best in zip(dcg_values[1:], idcg_values[1:]):
        result.append(round(float(achieved / best), 3) if best else 0.0)
    return result


def ndcg(ideal, rlist, k):
    """Return NDCG@k as a single float.

    Args:
        ideal: relevance scores in ideal (descending) order. The same
            sequence is used as the score table: document id d has
            relevance ideal[d - 1].
        rlist: 1-based document ids in returned order.
        k: cutoff rank; at most len(ideal) positions are evaluated.

    Returns:
        DCG@k / IDCG@k, or 0.0 when IDCG@k is 0.
    """
    depth = min(k, len(ideal))
    achieved = 0.0
    best = 0.0
    for rank in range(depth):
        score = _relevance_of(ideal, rlist[rank]) if rank < len(rlist) else 0.0
        achieved += dcg(score, rank)
        best += dcg(ideal[rank], rank)
    if not best:
        # Nothing relevant exists (or depth is 0): define NDCG as 0 rather
        # than dividing by zero.
        return 0.0
    return float(achieved / best)


if __name__ == "__main__":
    print(ndcg_at_each_rank(relevant_score_list, recall_list, 10))
    print(ndcg(ideal, recall_list, 10))
代码中用两种方式实现了 NDCG 的计算:一种给出 1 到 k 每个截断位置的 NDCG(保留三位小数),另一种只计算 NDCG@k 这一个值;两者的参数含义和结果形式不同,调用时请注意区分。