推荐算法-协同过滤
最近看了一篇协同过滤的文章《A Guide to Singular Value Decomposition for Collaborative Filtering》,该文为协同过滤设计了一种有效的SVD算法。
V是偏好分数矩阵,I{i,j}=1代表用户i对item j 有偏好,否则I{i,j}=0,一般V是稀疏的。协同过滤的目的是预测稀疏矩阵中没有展现的评分。协同过滤算法一种普遍的评价方法是Root Mean Square Error(RMSE):设预测矩阵为P,真实矩阵为A,J为A中被预测项的指示矩阵(定义同I),则RMSE定义为:

$$\mathrm{RMSE} = \sqrt{\frac{\sum_{i,j} J_{i,j}\,(A_{i,j}-P_{i,j})^{2}}{\sum_{i,j} J_{i,j}}}$$
SVD算法的目的是找到两个低秩特征矩阵U(用户×特征)和M(item×特征),使得V ≈ U·Mᵀ,从而用U的第i行与M的第j行的内积(加上偏置)预测用户i对item j的评分。
代码如下:
// Collaborative filtering via biased matrix factorization ("SVD" model),
// trained with stochastic gradient descent on the MovieLens 100k data set.
// Model: r(i,j) = mean + bu[i] + bi[j] + U_i . M_j, clamped to [1, 5].
#include <iostream>
#include <string>
#include <fstream>
#include <cmath>    // sqrt
#include <cstring>  // memset
#include <cstdlib>  // exit, rand
using namespace std;

const int USERMAX = 1000;   // array capacity: valid user ids are [0, USERMAX)
const int ITEMMAX = 2000;   // array capacity: valid item ids are [0, ITEMMAX)
const int FEATURE = 50;     // number of latent factors
const int ITER_MAX = 30;    // SGD epochs

double rating[USERMAX][ITEMMAX];  // observed ratings (0 where unrated)
int I[USERMAX][ITEMMAX];          // indicator: 1 iff user i rated item j
double UserF[USERMAX][FEATURE];   // user latent factors U
double ItemF[ITEMMAX][FEATURE];   // item latent factors M
double BIASU[USERMAX];            // per-user rating bias
double BIASI[ITEMMAX];            // per-item rating bias
double lambda = 0.15;             // L2 regularization weight
// NOTE: renamed from `gamma` — <math.h> declares a ::gamma() function on
// POSIX systems, so a global `double gamma` fails to compile there.
double lrate = 0.05;              // SGD learning rate
double mean;                      // global mean of all observed ratings

// Predict user i's rating of item j: global mean + both biases + the
// dot product of the latent factor vectors, clamped to the valid
// rating range [1, 5].
double predict(int i, int j)
{
    double rate = mean + BIASU[i] + BIASI[j];
    for (int f = 0; f < FEATURE; f++) {
        rate += UserF[i][f] * ItemF[j][f];
    }
    if (rate < 1) {
        rate = 1;
    } else if (rate > 5) {
        rate = 5;
    }
    return rate;
}

// RMSE over all observed training entries (I[i][j] == 1).
// Returns 0 when nothing has been observed (avoids 0/0).
double calRMSE()
{
    double total = 0;
    int cnt = 0;
    for (int i = 0; i < USERMAX; i++) {
        for (int j = 0; j < ITEMMAX; j++) {
            if (I[i][j]) {
                double err = rating[i][j] - predict(i, j);
                total += err * err;
                cnt++;
            }
        }
    }
    return cnt > 0 ? sqrt(total / cnt) : 0.0;
}

// Mean of all observed ratings; 0 when nothing has been observed.
double calMean()
{
    double total = 0;
    int cnt = 0;
    for (int i = 0; i < USERMAX; ++i) {
        for (int j = 0; j < ITEMMAX; ++j) {
            total += I[i][j] * rating[i][j];
            cnt += I[i][j];
        }
    }
    return cnt > 0 ? total / cnt : 0.0;
}

// Initialize the global mean and the per-user / per-item biases to the
// average residual (rating - mean) over each user's / item's ratings.
void initBias()
{
    memset(BIASU, 0, sizeof(BIASU));
    memset(BIASI, 0, sizeof(BIASI));
    mean = calMean();
    for (int i = 0; i < USERMAX; i++) {
        double total = 0;
        int cnt = 0;
        for (int j = 0; j < ITEMMAX; j++) {
            if (I[i][j]) {
                total += rating[i][j] - mean;
                cnt++;
            }
        }
        BIASU[i] = cnt > 0 ? total / cnt : 0;
    }
    for (int j = 0; j < ITEMMAX; j++) {
        double total = 0;
        int cnt = 0;
        for (int i = 0; i < USERMAX; i++) {
            if (I[i][j]) {
                total += rating[i][j] - mean;
                cnt++;
            }
        }
        BIASI[j] = cnt > 0 ? total / cnt : 0;
    }
}

// Load the training split, initialize biases and latent factors, then run
// ITER_MAX epochs of SGD, printing the training RMSE after each epoch.
void train()
{
    memset(rating, 0, sizeof(rating));
    memset(I, 0, sizeof(I));

    ifstream in("D:\\dataset\\ml-100k\\ub.base", ios::in);
    if (!in) {
        cout << "file not exist" << endl;
        exit(1);
    }
    int userId, itemId, rate;
    string timeStamp;  // read and discarded
    while (in >> userId >> itemId >> rate >> timeStamp) {
        // Guard against ids outside the statically sized arrays
        // (the original code overflowed on out-of-range ids).
        if (userId < 0 || userId >= USERMAX || itemId < 0 || itemId >= ITEMMAX) {
            continue;
        }
        rating[userId][itemId] = rate;
        I[userId][itemId] = 1;
    }

    initBias();

    // Random init in (0, 1]: small, non-zero factors so gradients flow.
    for (int f = 0; f < FEATURE; f++) {
        for (int i = 0; i < USERMAX; i++) {
            UserF[i][f] = (rand() % 100) / 100.0 + 0.001;
        }
        for (int j = 0; j < ITEMMAX; j++) {
            ItemF[j][f] = (rand() % 100) / 100.0 + 0.001;
        }
    }

    int iterCnt = 0;
    while (iterCnt < ITER_MAX) {
        for (int i = 0; i < USERMAX; i++) {
            for (int j = 0; j < ITEMMAX; j++) {
                if (!I[i][j]) {
                    continue;
                }
                double eui = rating[i][j] - predict(i, j);
                BIASU[i] += lrate * (eui - lambda * BIASU[i]);
                BIASI[j] += lrate * (eui - lambda * BIASI[j]);
                for (int f = 0; f < FEATURE; f++) {
                    // Cache the pre-update user factor: the item-factor
                    // gradient must use the OLD UserF value, not the one
                    // just modified (simultaneous update).
                    double uf = UserF[i][f];
                    UserF[i][f] += lrate * (eui * ItemF[j][f] - lambda * uf);
                    ItemF[j][f] += lrate * (eui * uf - lambda * ItemF[j][f]);
                }
            }
        }
        cout << "LOOP" << iterCnt << ": rmse is " << calRMSE() << endl;
        iterCnt++;
    }
}

// Evaluate the trained model on the held-out split and print its RMSE.
void test()
{
    ifstream in("D:\\dataset\\ml-100k\\ub.test");
    if (!in) {
        cout << "file not exist" << endl;
        return;
    }
    int userId, itemId, rate;
    string timeStamp;
    double total = 0;
    int cnt = 0;
    while (in >> userId >> itemId >> rate >> timeStamp) {
        if (userId < 0 || userId >= USERMAX || itemId < 0 || itemId >= ITEMMAX) {
            continue;
        }
        double err = rate - predict(userId, itemId);
        total += err * err;
        cnt++;
    }
    double rmse = cnt > 0 ? sqrt(total / cnt) : 0.0;
    cout << "test: rmse is " << rmse << endl;
}

int main()
{
    train();
    test();
    return 0;
}