中文分词（机械传统方法）正向最大匹配 - lzhenf

公告

// dictionary.h
#include <cstdlib>
#include <fstream>
#include <hash_map>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_set>
using namespace std;

// Word list used by the maximum-matching segmenter.
//
// The list is loaded once, at construction, from "wordlist.txt" in the current
// working directory. Only the first whitespace-delimited token of each line is
// stored; anything after it on the line is ignored.
class Cditionary
{
public:
    // Loads the word list. If the file cannot be opened, prints a diagnostic to
    // stderr and terminates the process with exit(-1).
    Cditionary();

    // Returns 1 if `w` is in the word list, 0 otherwise.
    int FindWord(const std::string &w) const;

private:
    // Every known word. std::unordered_set is the standard replacement for the
    // non-standard hash_map extension; only membership is ever queried.
    std::unordered_set<std::string> wordhash;
};

// `inline` because this definition lives in a header.
inline Cditionary::Cditionary()
{
    static const char *const kDictionaryPath = "wordlist.txt";

    std::ifstream infile(kDictionaryPath);
    if (!infile.is_open())
    {
        std::cerr << "Unable to open input file: " << kDictionaryPath
                  << " -- bailing out!" << std::endl;
        std::exit(-1);
    }

    std::string line;
    while (std::getline(infile, line))
    {
        std::istringstream fields(line);
        std::string word;
        // Blank lines yield no token; skip them instead of inserting a stale
        // or empty word.
        if (fields >> word)
            wordhash.insert(word);
    }
}

inline int Cditionary::FindWord(const std::string &w) const
{
    return wordhash.find(w) != wordhash.end() ? 1 : 0;
}

//main.cpp
#include "dictionary.h"
// Longest dictionary candidate tried by the segmenter, measured in bytes.
const int MaxWordLength = 10;
// Separator written after every segment in the output.
const char *const Sep = "/";

// Dictionary shared by the whole program and queried by SegmentSetence.
// Its constructor opens wordlist.txt and exits the process if that fails; as a
// namespace-scope object it is constructed during program start-up.
Cditionary WordDic;

// 字符串用最大匹配法处理 
string SegmentSetence(string s1)
{
    string s2 = "";

    while (! s1.empty())
    {
        int len = s1.length();
        if (len > MaxWordLength)
            len = MaxWordLength;
        string temp = s1.substr(0, len);

        int n = WordDic.FindWord(temp);
        while (len > 2 && n == 0)
        {
            len -= 2;
            temp = temp.substr(0 , len);
            n = WordDic.FindWord(temp);
        }
        s2 += temp + Sep;
        s1 = s1.substr(temp.length(), s1.length());
    }
    return s2;
}

// Segments the text file named by argv[1] line by line and writes the result,
// one output line per input line, to "result.txt" in the working directory.
// Returns 0 on success and -1 when the argument is missing or a file cannot be
// opened.
int main(int argc , char * argv[])
{
    // Without this check a missing argument would pass a null pointer to ifstream.
    if (argc < 2)
    {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "segment")
                  << " <input file>" << std::endl;
        return -1;
    }

    std::ifstream infile(argv[1]);
    if (!infile.is_open())
    {
        std::cerr << "Unable to open input file: " << argv[1]
                  << " -- bailing out!" << std::endl;
        return -1;
    }

    static const char *const kOutputPath = "result.txt";
    std::ofstream outfile1(kOutputPath);
    if (!outfile1.is_open())
    {
        std::cerr << "Unable to open file: " << kOutputPath
                  << " -- bailing out!" << std::endl;
        return -1;
    }

    // Read real lines: the delimiter must be the newline character, not the
    // letter 'n' (which can also occur as the second byte of a double-byte
    // character).
    std::string strtmp;
    while (std::getline(infile, strtmp))
        outfile1 << SegmentSetence(strtmp) << '\n';

    return 0;
}

原理参见：52NLP

每次从剩余文本的开头取出词典中能匹配到的最长词（不超过最大词长）；若匹配不到，则逐字缩短候选词，直到匹配成功或只剩一个字。输出该词后将其从文本中截去，再对余下部分重复同样的匹配过程。

posted on 2012-04-05 22:56 lzhenf 阅读(1099) 评论(0) 编辑收藏举报

刷新页面返回顶部