The code is adapted from a CSDN blog post whose exact title I no longer remember.
The header with the common type aliases:
// common.h
#ifndef COMM_H
#define COMM_H
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <iterator>
#include <cctype>
using namespace std;

typedef vector<string> StrVec;               // vector of strings
typedef vector<int> IntVec;                  // vector of ints
typedef vector<vector<int> > Int2DVec;       // 2-D vector of ints
typedef vector<vector<double> > Double2DVec; // 2-D vector of doubles
typedef vector<double> DoubleVec;            // vector of doubles

// Byte-safe tolower: casting through unsigned char avoids undefined
// behaviour when the text contains multi-byte (e.g. UTF-8 Chinese) characters.
inline char ToLowerChar(char c) { return (char)tolower((unsigned char)c); }

#endif
Stop-word removal:
#pragma once #include "common.h" // 用于移除停止词 class StopWordsHandler { public: StopWordsHandler(void); ~StopWordsHandler(void); bool IsStopWord(string& str); private: StrVec stopwords; }; #include "StopWordHandler.h" string StopWordList[] = {"的", "我们","要","自己","之","将","“","”",",","(",")","后","应","到","某","后","个","是","位","新","一","两","在","中","或","有","更","好",""};//停用词 int strwordlen = sizeof(StopWordList) / sizeof(StopWordList[0]); StopWordsHandler::StopWordsHandler() { for ( int i = 0 ; i < strwordlen ; i++) stopwords.push_back(StopWordList[i]); } StopWordsHandler::~StopWordsHandler() { } bool StopWordsHandler::IsStopWord(string& str) { transform(str.begin(),str.end(),str.begin(),tolower);//确保小写化 return find(stopwords.begin(),stopwords.end(),str)!=stopwords.end(); }
Tokenisation uses the simplest possible scheme: the documents are assumed to be pre-segmented, with tokens separated by spaces.
#pragma once #include "Common.h" class ITokeniser { public: virtual void Partition(string input,StrVec& retWords)=0;//分词算法 }; #pragma once #include "Itokenisher.h" class Tokeniser :public ITokeniser { public: Tokeniser(); ~Tokeniser(); void Partition(string input , StrVec& retWords); }; #include "Tokeniser.h" #include "StopWordHandler.h" #include <iterator> Tokeniser::Tokeniser() { } Tokeniser::~Tokeniser() { } void Tokeniser::Partition(string input ,StrVec& retWord) { transform(input.begin() , input.end(),input.begin(),tolower); string::iterator pos = input.begin(); StopWordsHandler stopHandler; do { string temp; pos = find(input.begin() , input.end(),' '); copy(input.begin() , pos ,back_inserter(temp)); if ( !stopHandler.IsStopWord(temp)) retWord.push_back(temp); if ( pos == input.end()) break; else input.erase(input.begin() ,++pos); }while ( pos != input.end()); }
Computing the TF-IDF weights:
#pragma once #include "Itokenisher.h" #include <map> class TFIDFMeasure { private: StrVec _docs; //文档集合 , 每一行字符串代表一个文档 int _numDocs; //文档数目 int _numTerms;//单词数目 StrVec _terms;//单词集合 Int2DVec _termFreq ;//每个单词出现在每份文档的频率 Double2DVec _termWeight;//每个单词在每份文档的权重 IntVec _maxTermFreq ;//记录每份文档的最大词频 IntVec _docFreq;//出现这个单词的文档频率 ITokeniser* _tokeniser;//分词器 map<string , int > _wordIndex;//单词映射表 public : TFIDFMeasure(const StrVec& document , ITokeniser * tokeniser); ~TFIDFMeasure(); inline int NumTerm( ) const { return this->_numTerms; } void GetTermVector(int doc , DoubleVec& vec);//获取项向量 protected: void init();//初始化tf-idf计数 void GenerateTerms(const StrVec& ,StrVec& terms);//分词处理 void GenerateTermFrequency();//计算词频 void GenerateTermWeight();//计算词的权重 void GetWordFrequency( string & input ,map<string,int> &freq); int CountWords(string& word ,const StrVec& words); int GetTermIndex(const string& term);//查询词语对应的下标 double ComputeTermWeight(int term ,int doc);//计算词语在制定文档的频率 double GetTermFrequency(int term , int doc);//获取词语在文档的频率 double GetInverseDoucumentFrequency(int term); //计算逆文档频率 }; #include "TF_IDF.h" TFIDFMeasure::~TFIDFMeasure() { if (this->_tokeniser != NULL) { delete _tokeniser; _tokeniser = NULL; } _docs.clear(); _terms.clear(); _wordIndex.clear(); } TFIDFMeasure::TFIDFMeasure(const StrVec& document , ITokeniser * tokeniser ) { _docs = document; _numDocs = document.size(); _tokeniser = tokeniser; this->init(); } void TFIDFMeasure::init() { //初始化 this->GenerateTerms(_docs,_terms); //分词 this->_numTerms = _terms.size(); //所有文档中的词项数目 //申请空间 _maxTermFreq.resize(_numDocs); _docFreq.resize(_numTerms); _termFreq.resize(_numTerms); _termWeight.resize(_numTerms); for (int i = 0 ; i < _terms.size() ; i++) { _termWeight[i].resize(_numDocs); _termFreq[i].resize(_numDocs); _wordIndex[_terms[i]] = i; //将单词放入单词映射表中 } this->GenerateTermFrequency(); this->GenerateTermWeight(); } void TFIDFMeasure::GenerateTerms(const StrVec& docs ,StrVec &terms) { for (int i = 0 ; i < docs.size() ; i++) { StrVec words; _tokeniser->Partition(docs[i] , words); //分词部分 for ( int j = 0 ; j < words.size() ; j++) { if ( find(terms.begin() , terms.end(),words[j] ) == terms.end()) terms.push_back(words[j]); } } } void TFIDFMeasure::GenerateTermFrequency() { //计算每个单词在每份文档中出现的概率 for ( int i = 0 ; i < _numDocs ; i++) { string curDoc = _docs[i]; //当前待处理的文档 map<string,int> freq; this->GetWordFrequency(curDoc ,freq); map<string,int>::iterator iter; _maxTermFreq[i] = numeric_limits<int>::min(); for ( iter = freq.begin() ; iter != freq.end() ; iter++) { string word = iter->first; int wordFreq = iter->second; int termIndex = GetTermIndex(word); //单词下标 if ( termIndex == -1) continue; _termFreq[termIndex][i] = wordFreq; _docFreq[termIndex]++; if ( wordFreq > _maxTermFreq[i]) _maxTermFreq[i] = wordFreq; } } } int TFIDFMeasure::GetTermIndex(const string & term) { map<string , int> ::iterator pos = _wordIndex.find(term); if ( pos != _wordIndex.end()) return pos->second; else return -1; } class WordComp { public: WordComp(string& sWord) : word(sWord) { } bool operator() (const string& lhs) { return lhs.compare(word)==0; } private: string word; }; void TFIDFMeasure::GetWordFrequency( string & input , map<string,int>& freq) { //计算单词频率 transform(input.begin(),input.end(),input.begin(),tolower); StrVec temp; this->_tokeniser->Partition(input , temp); unique(temp.begin() , temp.end()); StrVec::iterator iter; for ( iter = temp.begin() ; iter != temp.end() ; iter++) { int count = CountWords(*iter , temp); //计算单词在文档中出现的次数 freq[*iter] = count; } } int 
TFIDFMeasure::CountWords(string & word ,const StrVec& temp) { //计算每个单词在该文档的词频数目 int ncount = 0 ; ncount = count_if(temp.begin() , temp.end() , WordComp(word)); return ncount ; } void TFIDFMeasure::GenerateTermWeight() { for (int i = 0 ; i < _numTerms ; i++) for (int j = 0 ; j < _numDocs ; j++) _termWeight[i][j] = ComputeTermWeight( i , j ); } double TFIDFMeasure::ComputeTermWeight(int term , int doc) { float tf = GetTermFrequency(term , doc); float idf = GetInverseDoucumentFrequency(term); return tf * idf ; } double TFIDFMeasure::GetTermFrequency(int term , int doc) { int freq = _termFreq[term][doc]; //词频 int maxfreq = _maxTermFreq[doc]; return ((float) freq /(float)maxfreq); } double TFIDFMeasure::GetInverseDoucumentFrequency(int term) { int df = _docFreq[term]; return log((float)(_numDocs)/(float)df); } void TFIDFMeasure::GetTermVector(int doc ,DoubleVec& vec) { vec.resize(this->_numTerms); for ( int i = 0 ; i < this->_numTerms ; i++) vec[i] = _termWeight[i][doc]; }
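The weight computed above is the classic normalised TF times IDF: weight(term, doc) = (freq / maxFreqInDoc) * ln(numDocs / docFreq). A minimal sketch of the class in use on a three-document toy corpus (the driver file and the corpus are made up for illustration):

// tfidf_demo.cpp (hypothetical test driver)
#include "TF_IDF.h"
#include "Tokeniser.h"
#include <iostream>
using namespace std;

int main()
{
    StrVec docs;
    docs.push_back("apple banana apple");
    docs.push_back("banana cherry");
    docs.push_back("cherry apple");
    TFIDFMeasure tf(docs, new Tokeniser()); // the measure owns and deletes the tokeniser
    DoubleVec vec;
    tf.GetTermVector(0, vec); // TF-IDF vector of document 0
    // "apple": tf = 2/2 = 1, idf = ln(3/2) ~ 0.405, so its weight is ~0.405
    for (int i = 0; i < tf.NumTerm(); i++)
        cout << vec[i] << " ";
    cout << endl;
    return 0;
}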
Computing the cosine similarity distance:
#pragma once #include "common.h" class TermVector { public: static double ComputerCosineSimilarity(const DoubleVec& vector1 , const DoubleVec& vector2 ); static double innerProduct(const DoubleVec& v1 ,const DoubleVec& v2); static double VectorLength(const DoubleVec & v); }; #include "TermVector.h" #include <cmath> double TermVector::ComputerCosineSimilarity(const DoubleVec & v1 , const DoubleVec& v2) { if ( v1.size() != v2.size()) throw string("different length"); double denom = (VectorLength(v1) * VectorLength(v2)); if ( denom == 0 ) return 0 ; else return (innerProduct(v1 , v2) / denom); } double TermVector::innerProduct(const DoubleVec & v1 , const DoubleVec& v2) { if ( v1.size() != v2.size()) throw string ("different length"); double result = 0.0f; for ( int i = 0 ; i < v1.size() ; i++) result+=v1[i]*v2[i]; return result; } double TermVector::VectorLength(const DoubleVec & v) { double sum = 0.0f; for ( int i = 0 ; i < v.size() ; i++) sum= sum+(v[i] * v[i]); return (double)sqrt(sum); }
The Cluster class:
#pragma once #include "common.h" class Cluster { public: IntVec CurrentMembership; //该类簇的数据成员索引 DoubleVec Mean ; //该簇类的聚类中心 Cluster(); ~Cluster(); Cluster(int dataindex , DoubleVec & data); void UpdateMean(Double2DVec & coordinates); }; #include "cluster.h" Cluster::Cluster() { } Cluster::Cluster(int dataindex , DoubleVec& data) { CurrentMembership.push_back(dataindex); copy(data.begin() , data.end() ,back_inserter(Mean)); } void Cluster::UpdateMean(Double2DVec & coordinates) { //根据 mcurrentmembership取得原始资料点对象 //根据该子集的均值,corrdinate是一个m* n的矩阵,其实就是要求每列的均值 for (int i = 0 ; i< CurrentMembership.size();i++) { DoubleVec& coord = coordinates[CurrentMembership[i]]; for ( int j = 0 ; j < coord.size() ; j++) Mean[j]+=coord[j]; for (int k = 0 ; k <Mean.size() ; k++) Mean[k] /= coord.size(); } } Cluster::~Cluster() { }
#pragma once #include "common.h" class Cluster; class KMeans { public: vector<Cluster*> _clusters; KMeans(Double2DVec& data, int K); void Start(); ~KMeans(); private: int _coordCount; //数据的数量 Double2DVec _coordinates;//原始数据 int _k; //聚类的簇个数 IntVec _clusterAssignments; IntVec _nearestCluster; Double2DVec _distanceCache; void InitRandom(); static double getDistance(const DoubleVec & coord ,const DoubleVec& center); int NearestCluster(int ndx); }; #include "kmean.h" #include <time.h> #include "cluster.h" #include "TermVector.h" #include <limits> KMeans::KMeans(Double2DVec &data , int k ) { int i ; this->_coordinates.resize(data.size()); for ( i = 0 ; i <data.size() ; i++) copy(data[i].begin() , data[i].end(),back_inserter(_coordinates[i])); _coordCount = data.size(); _k = k; _clusters.resize(k); _clusterAssignments.resize(_coordCount); _nearestCluster.resize(_coordCount); _distanceCache.resize(_coordCount); for ( int i = 0 ; i <_coordCount ; i++) _distanceCache[i].resize(_coordCount); InitRandom(); } void KMeans::InitRandom() { srand(unsigned(time(NULL))); for (int i = 0 ; i < _k ; i++) { int temp = rand() %(_coordCount); //产生随机数 _clusterAssignments[temp] = i; _clusters[i] = new Cluster(temp ,_coordinates[temp]); } } void KMeans::Start() { int iter = 0 , i , j ; while ( true) { cout <<"Iteration " << iter++ << " ...." <<endl; //重新计算每个簇类的均值 for ( int i = 0 ; i <_k ; i++) { _clusters[i]->UpdateMean(_coordinates); } //计算每个数据和每个簇类中心的距离 for ( i = 0 ; i <_coordCount ; i++) { for ( j = 0 ; j <_k ; j++) { double dist = getDistance(_coordinates[i],_clusters[j]->Mean); _distanceCache[i][j] = dist; } } //计算每个数据离簇类最近 for ( i = 0 ; i <_coordCount ; i++) _nearestCluster[i] = this->NearestCluster(i); int k = 0 ; for ( i = 0 ; i <_coordCount ; i++) { if (_nearestCluster[i] == _clusterAssignments[i]) k++; } if ( k == _coordCount) break; for ( j = 0 ; j < _k ; j++) { _clusters[j]->CurrentMembership.clear(); } for ( i = 0 ; i <_coordCount ; i++) { _clusters[_nearestCluster[i]]->CurrentMembership.push_back(i); _clusterAssignments[i] = _nearestCluster[i]; } } } double KMeans::getDistance(const DoubleVec& coord , const DoubleVec& center) { return 1 - TermVector::ComputerCosineSimilarity(coord,center); } int KMeans::NearestCluster(int ndx) { int near = -1 ; double min = numeric_limits<double>::max(); for ( int c = 0 ; c <_k ; c++) { double d = _distanceCache[ndx][c]; if ( d < min) { min = d ; near = c ; } } return near; } KMeans::~KMeans() { vector<Cluster*>::iterator iter; for ( iter = this->_clusters.begin(); iter!=_clusters.end() ; iter++) delete (*iter); _clusters.clear(); }
#include "TF_IDF.h" #include "Tokeniser.h" #include <fstream> #include "kmean.h" #include "cluster.h" int main() { // 读入文档数据 StrVec strVec; ifstream inFile("c:\\input.txt"); string tempstr; while ( getline(inFile , tempstr)) { strVec.push_back(tempstr); } TFIDFMeasure tf(strVec , new Tokeniser()); int K =3 ; //聚类的个数 int docCount = strVec.size(); //生成k-mean的输入数据 Double2DVec data; data.resize(docCount); int dimension = tf.NumTerm(); for ( int i = 0 ; i < docCount ; i++) { tf.GetTermVector( i , data[i]); //获取第i个文档的TFIDF权重向量 } KMeans kmeans(data , K ); kmeans.Start(); vector<Cluster*> clusters = kmeans._clusters; vector<Cluster*>::iterator iter; IntVec::iterator it2 ; for ( iter = clusters.begin() ; iter != clusters.end() ; iter++) { cout <<"------------------------------------" <<endl; IntVec & vec = (*iter)->CurrentMembership; for ( it2 = vec.begin() ; it2 != vec.end() ; it2++) cout <<strVec[*it2] <<endl; } system("pause"); return 0 ; }