一个文本分词程序。
WordMap 类从分词库文件中逐行读入分词，
并将分词存入 std::unordered_map<std::string, int> 中。
#pragma once

#include <istream>
#include <unordered_map>
#include <string>
#include <ctime>

/// Dictionary loader for the word segmenter.
///
/// Stores the path of a dictionary file and the lookup table that init()
/// fills from that file.
class WordMap
{
public:
    /// Remembers the dictionary path; the file is not read until init() runs.
    WordMap(const std::string& filename);
    ~WordMap();

    /// Reads the dictionary file line by line into m_map.
    /// @return false when the file cannot be opened, true otherwise.
    bool init();

    /// Maps each dictionary word to the integer read from the third
    /// whitespace-separated field of its line.
    std::unordered_map<std::string, int> m_map;

    /// Path of the dictionary file opened by init().
    std::string m_filename;

private:
    /// Minimum gap, in time() units, between two progress messages of init().
    time_t difftime;

    /// Renders the hour, minute and second fields of the given tm as "h:m:s".
    std::string timestr(tm*);
};
#include "wordmap.h"

#include <fstream>
#include <iostream>
#include <sstream>
#include <ctime>

namespace {

/// Converts `when` into broken-down local time stored in `out`.
///
/// The two-argument localtime_s(tm*, const time_t*) form only exists in the
/// Microsoft CRT; everywhere else the POSIX localtime_r is used instead.
/// @return true when the conversion succeeded.
bool to_local_time(const time_t& when, tm& out)
{
#if defined(_MSC_VER) || defined(_WIN32)
    return localtime_s(&out, &when) == 0;
#else
    return localtime_r(&when, &out) != nullptr;
#endif
}

} // namespace

/// Stores the dictionary path and sets the progress-report interval to 5.
WordMap::WordMap(const std::string& filename)
    : m_filename(filename), difftime(5)
{
}

WordMap::~WordMap() {}

/// Loads the dictionary file named by m_filename into m_map.
///
/// Each line is read as "<int> <word> <int>"; the word becomes the key and
/// the trailing integer the mapped value (0 when that field is absent).
/// Lines without a leading integer and a word (e.g. blank lines) are skipped.
/// Start/end timestamps and periodic progress counts are written to std::cout.
///
/// @return false when the file cannot be opened, true otherwise.
bool WordMap::init()
{
    std::ifstream input(m_filename);
    if (!input.is_open())
    {
        std::cerr << "can't open file: " << m_filename << std::endl;
        return false;
    }

    time_t last;
    time(&last);

    // Value-initialised so the fields are defined even if conversion fails.
    tm nowtime = tm();
    to_local_time(last, nowtime);
    std::cout << "开始初始化分词库,当前时间" << timestr(&nowtime) << std::endl;

    std::string line;
    while (std::getline(input, line))
    {
        std::istringstream fields(line);
        int index = 0;
        std::string word;
        if (!(fields >> index >> word))
        {
            // Blank or malformed line: inserting it would create an empty
            // key with an indeterminate value.
            continue;
        }

        // A missing third field leaves the value at 0.
        int value = 0;
        fields >> value;
        m_map[word] = value;

        time_t cur;
        time(&cur);
        if (cur - last > difftime)
        {
            std::cout << "已初始化分词个数:" << m_map.size() << std::endl;
            last = cur;
        }
    }

    time_t finished;
    time(&finished);
    to_local_time(finished, nowtime);
    std::cout << "结束初始化分词库,当前时间" << timestr(&nowtime) << std::endl;
    return true;
}

/// Formats the hour, minute and second of *nowtime as "h:m:s" (no padding).
std::string WordMap::timestr(tm* nowtime)
{
    std::ostringstream out;
    out << nowtime->tm_hour << ":" << nowtime->tm_min << ":" << nowtime->tm_sec;
    // Returning the temporary directly; wrapping it in std::move adds nothing.
    return out.str();
}
从文本中读入，对文本进行分词，分词方法详见：
http://yangshangchuan.iteye.com/blog/2031813
以下是实现
#pragma once #include<string> using std::string; #include<vector> using std::vector; #include"wordmap.h" class FindWord { public: FindWord() {}; ~FindWord() {}; vector<string> GetKeyWords(const string& filename,const WordMap& wordmap); private: int wsize = 5; bool ischinese(const char* c); public: int getlocalfindstring(const string& ostring, int begpos); };
@ -0,0 +1,71 @@ #include "findword.h" #include<fstream> #include<sstream> #include<iostream> using std::ifstream; using std::istringstream; vector<string> FindWord::GetKeyWords(const string & filename, const WordMap& wordmap) { vector<string> l_keyword; ifstream inputfile(filename); if (!inputfile.is_open()) { std::cerr << "cann't not open file:" << filename; return l_keyword; } string sinput; string last; while (std::getline(inputfile, sinput)) { last = sinput; int begpos = 0; int length; while ((length = getlocalfindstring(last, begpos)) != 0) { int movelen = ischinese(&last[begpos]) ? 2:1; int findlen = -1; while (movelen<=length) { string ls = last.substr(begpos, movelen); auto res = wordmap.m_map.find(ls); if (res != wordmap.m_map.end()) { findlen = movelen; } movelen += ischinese(&last[begpos + movelen]) ? 2 : 1; } if (findlen != -1) { l_keyword.push_back(last.substr(begpos, findlen)); begpos = begpos + findlen; } else { begpos += length; } } } return l_keyword; } bool FindWord::ischinese(const char* c) { unsigned char cur = *c; unsigned char next = *(c + 1); if (next == 0)return false; return (cur >= 0xB0 && cur <= 0xF7) && (next >= 0xA1 && next <= 0xFE); } int FindWord::getlocalfindstring(const string& ostring,int begpos) { int size = wsize; int endpos = begpos; while (size > 0 && ostring[endpos]) { if (ischinese(&ostring[endpos])) { endpos++; } size--; endpos++; } return endpos-begpos; }
样例程序
@ -0,0 +1,16 @@ #include"wordmap.h" #include<iostream> #include<string> #include"findword.h" using std::string; int main() { WordMap m_wordmap("../../../word/word1.txt"); FindWord m_findword; if (!m_wordmap.init()) { return 0; }; vector<string> res= m_findword.GetKeyWords("../../../inputfile/1999.txt", m_wordmap); for (auto elems : res) std::cout << elems << " "; return 0; }
github:https://github.com/wuzhuorui/kjct.git