// 数据挖掘之关联规则Apriori算法

/*
关联规则Apriori算法思想: 假设有20个样本(事务),每个样本包含I1,I2,I3,I4,I5中的若干项,例如A样本为{I1,I2,I3},B样本为{I1,I2,I4}。设最小支持度为2,即一个项集要成为频繁项集,至少要有2个样本同时包含它的全部项。例如项集{I1,I2}同时被A和B包含(其他样本也可能包含),因此它是频繁项集,可以认为I1和I2的关联性比较强。
算法从1-项集开始,逐级把k-1级频繁项集两两连接成k级候选项集,并按最小支持度筛选,直到找不到更大的频繁项集为止。
*/
#include <algorithm>
#include <climits>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using namespace std;

/**
 * Console-driven Apriori frequent-itemset miner.
 *
 * Transactions are read from stdin by getItem(); find_freitem() then grows
 * frequent itemsets level by level (1-itemsets, 2-itemsets, ...) and prints
 * every level to stdout as it goes.
 *
 * The "support" of an itemset is the number of stored transactions that
 * contain every item of the set. Passing the standard UINT_MAX (<climits>)
 * as the level to showAprioriItem() selects the "final result" heading.
 */
class Apriori{
public:
    /** One itemset / one transaction: item names kept sorted and unique. */
    typedef vector<string> ItemSet;
    /** Itemset -> support count. */
    typedef map<ItemSet,unsigned int> SupportMap;

    /**
     * @param is number of transactions getItem() will prompt for
     * @param mv minimum support an itemset needs to be considered frequent
     */
    Apriori(size_t is=0,unsigned int mv=0):item_size(is),min_value(mv){}
    ~Apriori(){}

    /** Reads item_size transactions from stdin, replacing any stored ones. */
    void getItem();

    /** Computes the frequent itemsets of the stored transactions. */
    SupportMap find_freitem();

    /** Joins frequent (K-1)-itemsets into the frequent K-itemsets. */
    SupportMap apri_gen(unsigned int K,const SupportMap& K_item);

    /** Prints one table of itemsets with their support counts. */
    void showAprioriItem(unsigned int K,const SupportMap& showmap) const;

private:
    /** Counts the stored transactions that contain every item of candidate. */
    unsigned int supportOf(const ItemSet& candidate) const;

    map<int,ItemSet> item;   /** transactions keyed by their 1-based number */
    SupportMap K_item;       /** frequent itemsets of the level reached so far */
    size_t item_size;        /** number of transactions to read */
    unsigned int min_value;  /** minimum support */
};

/**
 * Prompts for each transaction and reads whitespace-separated item names
 * until the terminator token "123" (or until stdin fails, in which case the
 * remaining transactions stay empty).
 *
 * Each transaction is stored sorted and de-duplicated: support counts
 * transactions, so an item repeated inside one transaction must not be
 * counted twice, and the sorted order is what supportOf() relies on.
 */
void Apriori::getItem(){
    /* Start from a clean slate so a second call re-reads the data instead of
       colliding with the keys 1..item_size stored by the first call. */
    item.clear();
    for(size_t i=0;i<item_size;++i){
        cout<<"请输入第"<<i+1<<"个事务的项集(123 end):";
        ItemSet transaction;
        string token;
        while(cin>>token&&token!="123"){
            transaction.push_back(token);
        }
        sort(transaction.begin(),transaction.end());
        transaction.erase(unique(transaction.begin(),transaction.end()),transaction.end());
        item[static_cast<int>(i+1)]=transaction;
    }
    cout<<"--------------运行结果如下:--------------"<<endl;
}

/**
 * Runs the level-wise search and returns the itemsets of the last level
 * that still had frequent itemsets.
 *
 * Every intermediate level is printed by apri_gen(); the last level reached
 * is printed here, either under the "final" heading (the next level produced
 * nothing new) or under its own level number (a single itemset survived, so
 * nothing is left to join).
 *
 * @return the frequent itemsets of the highest level reached; an empty map
 *         when every stored transaction is empty.
 */
Apriori::SupportMap Apriori::find_freitem(){
    bool hasItems=false;
    for(map<int,ItemSet>::const_iterator tx=item.begin();tx!=item.end();++tx){
        if(!tx->second.empty()){
            hasItems=true;
            break;
        }
    }
    if(!hasItems){
        cout<<"事务集为空!程序无法进行..."<<endl;
        return SupportMap();
    }

    /* Forget the result of an earlier run so every call starts at level 1. */
    K_item.clear();

    unsigned int level=0;
    bool converged=false;
    for(;;){
        const SupportMap previous=K_item;
        ++level;
        K_item=apri_gen(level,previous);
        if(K_item==previous){
            /* apri_gen hands back its input when the join yields nothing. */
            converged=true;
            break;
        }
        if(K_item.size()<=1){
            /* Fewer than two itemsets cannot be joined any further. */
            break;
        }
    }

    cout<<endl;
    /* Label the table with the level that was actually computed. */
    showAprioriItem(converged?UINT_MAX:level,K_item);
    return K_item;
}

/**
 * Builds the frequent K-itemsets.
 *
 * For K == 1 the stored transactions are scanned and every item whose
 * support reaches min_value becomes a 1-itemset (K_item is ignored).
 *
 * For any other K the given (K-1)-itemsets are printed first, then every
 * pair that agrees on all items but the last is joined into a K-item
 * candidate, which is kept when its support is non-zero and at least
 * min_value.
 *
 * @param K      level to generate
 * @param K_item frequent itemsets of level K-1 (sorted item vectors)
 * @return the frequent K-itemsets, or a copy of K_item when no candidate
 *         qualifies, which the caller uses as its stop signal.
 */
Apriori::SupportMap Apriori::apri_gen(unsigned int K,const SupportMap& K_item){
    if(1==K){
        /* Transactions hold unique items, so occurrences == transactions. */
        map<string,unsigned int> occurrences;
        for(map<int,ItemSet>::const_iterator tx=item.begin();tx!=item.end();++tx){
            for(ItemSet::const_iterator it=tx->second.begin();it!=tx->second.end();++it){
                ++occurrences[*it];
            }
        }
        SupportMap frequent;
        for(map<string,unsigned int>::const_iterator occ=occurrences.begin();occ!=occurrences.end();++occ){
            if(occ->second>=min_value){
                frequent.insert(make_pair(ItemSet(1,occ->first),occ->second));
            }
        }
        return frequent;
    }

    cout<<endl;
    showAprioriItem(K-1,K_item);

    SupportMap candidates;
    for(SupportMap::const_iterator lhs=K_item.begin();lhs!=K_item.end();++lhs){
        const ItemSet& a=lhs->first;
        if(a.empty()) continue;
        SupportMap::const_iterator rhs=lhs;
        for(++rhs;rhs!=K_item.end();++rhs){
            const ItemSet& b=rhs->first;
            /* Join only itemsets that share everything except the last item. */
            if(b.size()!=a.size()||!equal(a.begin(),a.end()-1,b.begin())) continue;

            ItemSet joined(a);
            joined.push_back(b.back());
            sort(joined.begin(),joined.end());

            const unsigned int support=supportOf(joined);
            if(support>=min_value&&support!=0){
                candidates.insert(make_pair(joined,support));
            }
        }
    }

    if(candidates.empty()) return K_item;
    return candidates;
}

/**
 * Returns how many stored transactions contain all items of candidate.
 * Both the transactions (see getItem) and candidate must be sorted.
 */
unsigned int Apriori::supportOf(const ItemSet& candidate) const{
    unsigned int support=0;
    for(map<int,ItemSet>::const_iterator tx=item.begin();tx!=item.end();++tx){
        if(includes(tx->second.begin(),tx->second.end(),candidate.begin(),candidate.end())){
            ++support;
        }
    }
    return support;
}

/**
 * Prints showmap as a two-column table (itemset, support).
 *
 * @param K       level shown in the heading; UINT_MAX selects the
 *                "final frequent itemsets" heading instead
 * @param showmap itemsets to print
 */
void Apriori::showAprioriItem(unsigned int K,const SupportMap& showmap) const{
    if(K!=UINT_MAX) cout<<endl<<K<<" 级频繁项集为:"<<endl;
    else cout<<"最终的频繁项集为:"<<endl;
    cout<<"项 集"<<" \t "<<"频率"<<endl;
    for(SupportMap::const_iterator row=showmap.begin();row!=showmap.end();++row){
        cout<<"{";
        for(ItemSet::const_iterator it=row->first.begin();it!=row->first.end();++it){
            cout<<*it<<" ";
        }
        cout<<"}"<<" \t ";
        cout<<row->second<<endl;
    }
}

/**
 * Parses a non-negative decimal number made of digits only.
 *
 * @param str NUL-terminated text, may be NULL
 * @return the parsed value; 0 when str is NULL, empty, contains a non-digit
 *         character, or does not fit in unsigned int.
 */
unsigned int parseNumber(const char *str){
    if(str==NULL) return 0;
    unsigned int num=0;
    for(const char *p=str;*p!='\0';++p){
        if(*p<'0'||*p>'9') return 0;
        const unsigned int digit=static_cast<unsigned int>(*p-'0');
        /* Reject instead of silently wrapping around. */
        if(num>(UINT_MAX-digit)/10) return 0;
        num=num*10+digit;
    }
    return num;
}

/**
 * Program entry point.
 *
 * NOTE(review): the interactive driver was already commented out in the
 * original, so the program intentionally still does nothing. The disabled
 * code below is kept for reference with one repair: the original read the
 * input with `char *str=new char; cin>>str;`, which overflows a one-byte
 * buffer and leaks it; it now reads into a std::string and stops on EOF.
 */
int main(){
    // unsigned int itemsize=0;
    // unsigned int min=0;
    // string line;
    // do{
    //     cout<<"请输入事务数:";
    //     if(!(cin>>line)) return 1;
    //     itemsize=parseNumber(line.c_str());   // number of transactions
    //     if(itemsize==0){
    //         cout<<"请输入大于0正整数!"<<endl;
    //     }
    // }while(itemsize==0);
    // do{
    //     cout<<"请输入最小阈值:";
    //     if(!(cin>>line)) return 1;
    //     min=parseNumber(line.c_str());        // minimum support
    //     if(min==0){
    //         cout<<"请输入大于0正整数!"<<endl;
    //     }
    // }while(min==0);
    // Apriori a(itemsize,min);
    // a.getItem();
    // map<vector<string>,unsigned int> AprioriMap=a.find_freitem();   // frequent itemsets
    // a.showAprioriItem(UINT_MAX,AprioriMap);
    return 0;
}

 

// posted @ 2017-03-29 22:08  wust_ouyangli  阅读(617)  评论(0)  编辑  收藏  举报