MovieLens数据挖掘练习
第一次写数据挖掘方面的程序,只是一个简单的练习,对大数据处理表示无能...
程序采用Item-Based Collaborative Filtering(基于项目的协同过滤算法)产生推荐。首先将训练集数据读入内存,进行分析。再读取每一条测试集数据进行判断:
- 如果该数据测试user对测试item已有评价,则直接给出该评价值
- 如果该测试user不存在,判断:若测试item存在,则给出所有user对该item的平均分;若item也不存在,则直接得到1和5的均值3。
- 如果测试item不存在,测试user存在,则取该用户对其他item的评分的均值作为对该项目的评分。
- 如果测试user和测试item均存在,且测试user对测试的item没有过评价,则采用Adjusted Cosine Similarity算出该项目与其他项目的相似度,取前n个最近邻,然后采用推荐函数得到评分。
最后将每一条得到的预测和真实值对比得到mae值。
产生推荐的算法思路:
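通过相似性度量得到测试item的最近邻居,然后据此产生推荐。设item u的最近邻集合用 $NBS_u$ 表示,则user i对item u的预测评分 $P_{u,i}$ 可以通过user i对最近邻集合中各item的评分得到:
$$P_{u,i} = \bar{R}_u + \frac{\sum_{n \in NBS_u} sim(u,n)\,\bigl(R_{n,i} - \bar{R}_n\bigr)}{\sum_{n \in NBS_u} \lvert sim(u,n) \rvert}$$
其中 $sim(u,n)$ 表示item u和item n之间的相似性,$R_{n,i}$ 表示user i对item n的评分,$\bar{R}_x$("Rx拔")表示项目x得到的平均得分。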
这种推荐方式的效果要好得多。
#include <iostream>
#include <fstream>
#include <sstream>
#include <map>
#include <string>
#include <math.h>
#include <list>
#include <stdio.h>
#include <ctime>
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

// Item-based collaborative filtering over a whitespace-separated rating file
// ("user_id film_id rating ..."). The training set is loaded into memory, a
// rating is predicted for every row of the test set, and the predictions are
// written to output.txt together with the MAE and RMSE against the real ratings.

// Maximum number of nearest-neighbour films used for one prediction.
const int K_NEIGHBORS = 20;

// id -> rating. Inside a User the key is a film id; inside a Film it is a user id.
typedef std::map<int, double> RatingMap;

// A user: mean of all ratings given, plus the ratings keyed by film id.
struct User
{
    double average_rating;
    RatingMap rating_map;
};

// A film: mean of all ratings received, plus the ratings keyed by user id.
struct Film
{
    double average_rating;
    RatingMap rating_map;
};

// One row of the test set: the (user, film) pair, the real rating read from
// the file and the rating predicted by this program.
struct TestUser
{
    int user_id;
    int film_id;
    double real_rating;
    double predict_rating;
};

// A neighbour film together with its similarity to the film being predicted.
struct SimilarFilm
{
    Film film;
    double similarity;
};

typedef std::map<int, Film> FilmMap;   // film id -> Film
typedef std::list<TestUser> TestList;  // rows of the test set, in file order
typedef std::map<int, User> UserMap;   // user id -> User

double filmSimilarity(const Film& test_film, const Film& train_film);
double predictRating(const Film& test_film, int user_id, const SimilarFilm* similar_film, int num);

// Parses "user_id film_id rating" from one line; returns false when the line
// does not start with those three fields (blank line, header, garbage).
static bool parseRatingLine(const std::string& line, int& user_id, int& film_id, double& rating)
{
    std::istringstream fields(line);
    fields >> user_id >> film_id >> rating;
    return !fields.fail();
}

// Arithmetic mean of the ratings in the map, or 0 for an empty map.
static double meanRating(const RatingMap& ratings)
{
    if (ratings.empty())
    {
        return 0.0;
    }
    double sum = 0.0;
    for (RatingMap::const_iterator it = ratings.begin(); it != ratings.end(); ++it)
    {
        sum += it->second;
    }
    return sum / static_cast<double>(ratings.size());
}

// Reads the training file into both indexes. Returns false if the file cannot
// be opened. As with map::insert in general, the first rating seen for a
// (user, film) pair wins and later duplicates are ignored.
static bool loadTrainingSet(const char* path, FilmMap& films, UserMap& users)
{
    std::ifstream input(path);
    if (!input)
    {
        return false;
    }
    std::string line;
    while (std::getline(input, line))
    {
        int user_id = 0;
        int film_id = 0;
        double rating = 0.0;
        if (!parseRatingLine(line, user_id, film_id, rating))
        {
            continue;  // skip malformed lines instead of recording id 0 / rating 0
        }
        films[film_id].rating_map.insert(std::make_pair(user_id, rating));
        users[user_id].rating_map.insert(std::make_pair(film_id, rating));
    }
    return true;
}

// Fills in average_rating for every user and every film.
static void computeAverages(FilmMap& films, UserMap& users)
{
    for (UserMap::iterator user = users.begin(); user != users.end(); ++user)
    {
        user->second.average_rating = meanRating(user->second.rating_map);
    }
    for (FilmMap::iterator film = films.begin(); film != films.end(); ++film)
    {
        film->second.average_rating = meanRating(film->second.rating_map);
    }
}

// Reads the test file into `tests`. Returns false if the file cannot be opened.
static bool loadTestSet(const char* path, TestList& tests)
{
    std::ifstream input(path);
    if (!input)
    {
        return false;
    }
    std::string line;
    while (std::getline(input, line))
    {
        TestUser row;
        row.user_id = 0;
        row.film_id = 0;
        row.real_rating = 0.0;
        row.predict_rating = 0.0;
        if (!parseRatingLine(line, row.user_id, row.film_id, row.real_rating))
        {
            continue;
        }
        tests.push_back(row);
    }
    return true;
}

// (similarity, film) pair used while ranking neighbour candidates; the pointer
// refers into the FilmMap so that no rating map is copied during ranking.
typedef std::pair<double, const Film*> Candidate;

// Orders candidates from most to least similar.
static bool moreSimilar(const Candidate& lhs, const Candidate& rhs)
{
    return lhs.first > rhs.first;
}

// Predicts from the up-to-K_NEIGHBORS films rated by `test_user` that are most
// (and positively) similar to `test_film`.
static double predictFromNeighbours(const FilmMap& films, const Film& test_film,
                                    const User& test_user, int user_id)
{
    std::vector<Candidate> candidates;
    for (RatingMap::const_iterator rated = test_user.rating_map.begin();
         rated != test_user.rating_map.end(); ++rated)
    {
        const FilmMap::const_iterator neighbour = films.find(rated->first);
        if (neighbour == films.end())
        {
            continue;
        }
        const double similarity = filmSimilarity(test_film, neighbour->second);
        if (similarity > 0)
        {
            candidates.push_back(Candidate(similarity, &neighbour->second));
        }
    }

    // Keep only the K most similar candidates. Ranking all of them replaces the
    // earlier slot-replacement scheme, which tracked the weakest slot only once.
    const std::size_t keep = std::min<std::size_t>(candidates.size(), K_NEIGHBORS);
    std::partial_sort(candidates.begin(), candidates.begin() + keep, candidates.end(), moreSimilar);

    SimilarFilm neighbours[K_NEIGHBORS] = {};
    for (std::size_t i = 0; i < keep; ++i)
    {
        neighbours[i].film = *candidates[i].second;
        neighbours[i].similarity = candidates[i].first;
    }
    return predictRating(test_film, user_id, neighbours, static_cast<int>(keep));
}

// Chooses the prediction strategy for one (user, film) pair:
//  - neither is known: fixed default of 3.5
//  - only the user is known: the user's average rating
//  - only the film is known: the film's average rating
//  - the user already rated the film in the training set: that rating
//  - the user has rated at most two films: mean of the film and user averages
//  - otherwise: neighbour-based prediction
static double predictForPair(const FilmMap& films, const UserMap& users, int user_id, int film_id)
{
    const FilmMap::const_iterator film = films.find(film_id);
    const UserMap::const_iterator user = users.find(user_id);
    const bool film_known = (film != films.end());
    const bool user_known = (user != users.end());

    if (!film_known && !user_known)
    {
        return 3.5;
    }
    if (!film_known)
    {
        return user->second.average_rating;
    }
    if (!user_known)
    {
        return film->second.average_rating;
    }

    const RatingMap& user_ratings = user->second.rating_map;
    const RatingMap::const_iterator already_rated = user_ratings.find(film_id);
    if (already_rated != user_ratings.end())
    {
        return already_rated->second;
    }

    // Too few rated films to build a meaningful neighbourhood.
    if (user_ratings.size() <= 2)
    {
        return (film->second.average_rating + user->second.average_rating) / 2;
    }
    return predictFromNeighbours(films, film->second, user->second, user_id);
}

// Loads train.txt and test.txt from the working directory, predicts every test
// row and writes the rows plus MAE, RMSE and elapsed CPU time to output.txt.
// Returns -1 if any of the three files cannot be opened, 0 otherwise.
int main()
{
    const std::clock_t begin = std::clock();

    FilmMap filmMap;
    UserMap userMap;
    if (!loadTrainingSet("train.txt", filmMap, userMap))
    {
        std::cerr << "error:unable to open file " << "train.txt" << std::endl;
        return -1;
    }
    std::cout << "Read file complete." << std::endl;

    computeAverages(filmMap, userMap);
    std::cout << "Process data complete." << std::endl;

    TestList testList;
    if (!loadTestSet("test.txt", testList))
    {
        std::cerr << "error:unable to open file " << "test.txt" << std::endl;
        return -1;
    }

    for (TestList::iterator row = testList.begin(); row != testList.end(); ++row)
    {
        row->predict_rating = predictForPair(filmMap, userMap, row->user_id, row->film_id);
    }
    std::cout << "Load TestSet Complete. Predict over." << std::endl;

    std::ofstream output("output.txt");
    if (!output)
    {
        std::cerr << "error:unable to open file " << "output.txt" << std::endl;
        return -1;
    }

    double MAE = 0;
    double RMSE = 0;
    output << "用户\t" << "电影\t" << "实际评分\t" << "预测评分\t" << std::endl;
    for (TestList::iterator it = testList.begin(); it != testList.end(); ++it)
    {
        // Cap predictions at the top of the rating scale. No lower bound is
        // applied, as the minimum of the scale is not known here.
        if (it->predict_rating > 5.0)
        {
            it->predict_rating = 5.0;
        }
        const double error = it->predict_rating - it->real_rating;
        MAE += std::fabs(error);
        RMSE += error * error;
        output << it->user_id << "\t" << it->film_id << "\t" << it->real_rating << "\t" << " "
               << it->predict_rating << "\n";
    }

    // An empty test set would otherwise divide by zero and report NaN.
    if (!testList.empty())
    {
        const double count = static_cast<double>(testList.size());
        MAE /= count;
        RMSE = std::sqrt(RMSE / count);
    }

    const std::clock_t end = std::clock();
    output << "\nMAE: " << MAE << "\n" << "RMSE: " << RMSE << "\n"
           << "time: " << double(end - begin) / CLOCKS_PER_SEC << std::endl;
    std::cout << "Output finish!" << std::endl;
    return 0;
}

// Similarity between two films, computed over the users who rated both.
//
// Each film's ratings are centred on its own mean over those co-rating users,
// and the result is the correlation of the two centred rating vectors, so it
// lies in [-1, 1].
//
// Returns 0 when the films share no rater, or when either film's co-rated
// ratings have (near) zero variance and the correlation is undefined.
double filmSimilarity(const Film& test_film, const Film& train_film)
{
    std::vector<double> test_ratings;
    std::vector<double> train_ratings;
    double test_sum = 0;
    double train_sum = 0;

    // Both maps are ordered by user id, so one merge-style pass finds every
    // user present in both.
    RatingMap::const_iterator test_it = test_film.rating_map.begin();
    RatingMap::const_iterator train_it = train_film.rating_map.begin();
    const RatingMap::const_iterator test_end = test_film.rating_map.end();
    const RatingMap::const_iterator train_end = train_film.rating_map.end();
    while (test_it != test_end && train_it != train_end)
    {
        if (test_it->first == train_it->first)
        {
            test_ratings.push_back(test_it->second);
            train_ratings.push_back(train_it->second);
            test_sum += test_it->second;
            train_sum += train_it->second;
            ++test_it;
            ++train_it;
        }
        else if (test_it->first < train_it->first)
        {
            ++test_it;
        }
        else
        {
            ++train_it;
        }
    }

    const std::size_t common = test_ratings.size();
    if (common == 0)
    {
        return 0;
    }
    const double test_mean = test_sum / static_cast<double>(common);
    const double train_mean = train_sum / static_cast<double>(common);

    double covariance = 0;
    double test_variance = 0;
    double train_variance = 0;
    for (std::size_t i = 0; i < common; ++i)
    {
        const double test_dev = test_ratings[i] - test_mean;
        const double train_dev = train_ratings[i] - train_mean;
        covariance += test_dev * train_dev;
        test_variance += test_dev * test_dev;
        train_variance += train_dev * train_dev;
    }

    if (test_variance <= 0.00001 || train_variance <= 0.00001)
    {
        return 0;
    }
    return covariance / (std::sqrt(test_variance) * std::sqrt(train_variance));
}

// Predicts the rating `user_id` would give `test_film` from `num` neighbour
// films in `similar_film`:
//
//   average(test_film) + sum(sim_n * (r_n - average(n))) / sum(|sim_n|)
//
// where r_n is the user's rating of neighbour n. Neighbours the user has not
// rated contribute to neither sum. The result is rounded to the nearest
// multiple of 0.5. If no neighbour carries weight, the film's average rating is
// returned unrounded.
double predictRating(const Film& test_film, int user_id, const SimilarFilm* similar_film, int num)
{
    double weighted_deviation = 0;
    double weight_total = 0;
    for (int i = 0; i < num; ++i)
    {
        const Film& neighbour = similar_film[i].film;
        const RatingMap::const_iterator rating = neighbour.rating_map.find(user_id);
        if (rating == neighbour.rating_map.end())
        {
            continue;
        }
        weighted_deviation += similar_film[i].similarity * (rating->second - neighbour.average_rating);
        weight_total += std::fabs(similar_film[i].similarity);
    }

    if (weight_total < 0.00001)
    {
        return test_film.average_rating;
    }
    const double raw = test_film.average_rating + weighted_deviation / weight_total;
    // Round to the nearest half point; the value must stay a double throughout.
    return std::floor(raw * 2.0 + 0.5) / 2.0;
}
好吧最后- -求鞭挞吧..