// dictionary.h
#include <cstdlib>
#include <fstream>
#include <hash_map> // non-standard legacy extension header
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
using namespace std;
// Word dictionary used by the segmenter.
//
// The constructor fills the table from a word-list file; FindWord() then
// answers membership queries against it.
class Cditionary
{
public:
    // Loads the word list into the lookup table.
    Cditionary();
    ~Cditionary();

    // Returns 1 when `w` is a known word, 0 otherwise.
    int FindWord(std::string w);

private:
    // Scratch buffers the constructor uses while reading the word list:
    // the current line and the first token on that line.
    std::string strtmp;
    std::string word;

    // Known words mapped to the constant 1; only key presence is queried.
    // std::unordered_map is the standard replacement for the non-standard
    // hash_map extension and offers the same operator[]/find/end interface.
    std::unordered_map<std::string, int> wordhash;
};
// Builds the dictionary from "wordlist.txt" in the current working directory.
//
// Each line of the file is read and its first whitespace-delimited token is
// stored as a known word; anything after that token is ignored. If the file
// cannot be opened, a message is written to stderr and the process exits
// with status -1.
Cditionary::Cditionary()
{
    // Single source of truth so the error message always names the file
    // that was actually opened.
    const char* const wordListPath = "wordlist.txt";

    ifstream infile(wordListPath); // open the dictionary file
    if (!infile.is_open())         // cannot continue without a dictionary
    {
        cerr << "Unable to open input file: " << wordListPath << " -- bailing out!" << endl;
        exit(-1);
    }

    string currentLine;
    string firstToken;
    while (getline(infile, currentLine)) // one dictionary entry per line
    {
        istringstream fields(currentLine);
        // Only insert when a token was really extracted: on a blank line the
        // extraction fails and the target string keeps its previous contents,
        // which would otherwise be re-inserted (or an empty key on line one).
        if (fields >> firstToken)
            wordhash[firstToken] = 1;
    }
}
// Destructor: performs no explicit work; the string and table members
// release their own storage.
Cditionary::~Cditionary() {}
// Membership test against the loaded word table.
//
// s: the candidate word, compared exactly as given.
// Returns 1 if `s` is a key of the table, 0 if it is not.
int Cditionary::FindWord(string s)
{
    const bool known = !(wordhash.find(s) == wordhash.end());
    return known ? 1 : 0;
}
//main.cpp
#include "dictionary.h"
// Longest candidate, in bytes of the std::string, that the segmenter tries to
// look up in the dictionary before it starts shortening the candidate.
const int MaxWordLength = 10;

// Separator appended after every segment in the output.
const char* const Sep = "/";
// Dictionary instance queried by SegmentSetence(). As a namespace-scope
// object it is constructed before main() runs, so the word list is loaded
// (or the process exits on failure to open it) at program start-up.
Cditionary WordDic;
// Segments a string with forward maximum matching.
//
// Starting at the front of `s1`, a candidate of at most MaxWordLength bytes
// is looked up in WordDic. While the candidate is unknown and longer than
// 2 bytes it is shortened by 2 bytes and looked up again. The surviving
// candidate is emitted followed by Sep, and matching resumes right after it.
//
// NOTE(review): the 2-byte step presumably assumes a fixed two-bytes-per-
// character encoding (e.g. GBK) -- confirm against the corpus; single-byte or
// UTF-8 text would be cut at arbitrary byte boundaries.
//
// s1: the text to segment.
// Returns the segments concatenated, each one terminated by Sep; an empty
// input yields an empty string.
string SegmentSetence(string s1)
{
    string s2;
    const string::size_type total = s1.length();
    const string::size_type maxLen = static_cast<string::size_type>(MaxWordLength);

    // Walk through the input with an offset instead of repeatedly replacing
    // s1 with its own tail, which copied the whole remainder per segment.
    string::size_type pos = 0;
    while (pos < total)
    {
        string::size_type len = total - pos;
        if (len > maxLen)
            len = maxLen;

        // Shrink until the dictionary knows the candidate or it is down to
        // (at most) 2 bytes, which is emitted whether or not it is known.
        while (len > 2 && WordDic.FindWord(s1.substr(pos, len)) == 0)
            len -= 2;

        s2.append(s1, pos, len);
        s2 += Sep;
        pos += len;
    }
    return s2;
}
// Entry point: segments a text file line by line.
//
// argv[1] names the input file. Every line read from it is passed through
// SegmentSetence() and the result is written as one line of "result.txt" in
// the current working directory.
//
// Returns 0 on success and -1 when the argument is missing or either file
// cannot be opened.
int main(int argc , char * argv[])
{
    // argv[1] must exist before it is handed to ifstream.
    if (argc < 2)
    {
        cerr << "Usage: " << (argc > 0 ? argv[0] : "segment") << " <input file>" << endl;
        return -1;
    }

    ifstream infile(argv[1]); // open the input corpus
    if (!infile.is_open())
    {
        cerr << "Unable to open input file: " << argv[1] << " -- bailing out!" << endl;
        return -1;
    }

    ofstream outfile1("result.txt"); // destination for the segmented text
    if (!outfile1.is_open())
    {
        cerr << "Unable to open file: result.txt" << " -- bailing out!" << endl;
        return -1;
    }

    string strtmp; // holds the line currently being processed
    // Split on the newline character (the default delimiter), not on the
    // letter 'n'.
    while (getline(infile, strtmp))
    {
        // '\n' instead of endl: no need to flush the file after every line.
        outfile1 << SegmentSetence(strtmp) << '\n';
    }
    return 0;
}
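// For the underlying principle, see 52NLP.
// Each step takes the longest match found, cuts it off the front of the
// text, and then continues matching on the remainder.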