在字典中去掉影响召回的长词

一个小应用,show 下代码,主要是为了推荐我非常喜欢的 glog 和 gflags :) 感谢 Google,它们让我的生活更轻松 :)
另外,读取数据库用 otl 相当方便。我用 otl 封装了一下,写了一个 DBReader,这样处理数据库基本就和处理文本一样:接口完全相同,数据库的操作被完全屏蔽,方便了很多。
 
/** 
 *  ==============================================================================
 * 
 *          \file   quechaokafei.cc
 *          \author goldenlock
 *          \Description:  由于标题中“雀巢 咖啡”的存在,造成用户搜“雀巢咖啡”搜不到产品。
 *                         解决办法:读取数据库中所有单品的标题,如果发现
 *                         “雀巢*咖啡”或“咖啡*雀巢”,则把这个词加入到
 *                         blacklist 中(当前为 dict/quechaokafei.txt)
 *
 *  ==============================================================================
 */

#define private public
#define protected public
#include <iostream>
#include <string>
#include <vector>
#include <fstream>

#include "utils/db_reader.h"
#include <algorithm>
#include <boost/progress.hpp>
#include <glog/logging.h>
#include <gflags/gflags.h>

#include <tr1/unordered_set>
#include "gbk_ch_converter.h"
#include "string_help.h"
#include "config_help.h"
#include "debug_help.h"
#include "include/segmentor.h"
#include "include/gbk_datrie.h"

using namespace std;
// Command-line flags (gflags). Each one is read in the code below as FLAGS_<name>.
// NOTE(review): `type` is not referenced anywhere in the visible code and has an
// empty help string — confirm whether it is still needed.
DEFINE_string(type, "simple", "");
// Database configuration file; passed to DBReader::init() in run().
DEFINE_string(config, "read_db.ini", "数据库配置文件,读取其中的title数据");
// Section name passed to DBReader::init() together with the config file.
DEFINE_string(section, "all_title", "读取其中的title数据");
// Output path for the extracted blacklist words; opened by QuechaokafeiFunc.
DEFINE_string(o, "dict/quechaokafei.txt", "提取的黑名单词");
// Minimum word length; findQuechaokafei() compares each word's byte size
// against min_len * 2 (presumably two bytes per GBK character — confirm).
DEFINE_int32(min_len, 2, "单个词要求都>=2 当前");
// Source directory handed to the segment::ProbSegmentor constructor.
DEFINE_string(prob_dir, "testNgramTitle", "概率分词器的souce dir");

struct QuechaokafeiFunc
{
    typedef std::tr1::unordered_set<string> HashSet;
    HashSet m_candidates;
    ch_convert::ChConverter m_converter;
    segment::ProbSegmentor m_seg;
    segment::GBK_DATrie_ m_trie;
    ofstream ofs;

    QuechaokafeiFunc()
    : m_seg(FLAGS_prob_dir), m_trie(m_seg.m_seg.m_trie, m_seg.m_seg.m_encoder), ofs(FLAGS_o.c_str())
    {

    }

    void findQuechaokafei(const vector<string>& vec)
    {
        for (size_t i = 0; i < vec.size() - 1; i++)
        {
            if (vec[i].size() < FLAGS_min_len * 2 || !m_trie.search(vec[i]))
                continue;
            for (size_t j = i + 1; j < vec.size(); j++)
            {
                if (vec[j].size() < FLAGS_min_len * 2)
                    continue;
                if (m_trie.search(vec[j]))
                {
                    string s1 = vec[i] + vec[j];
                    string s2 = vec[j] + vec[i];
                    if (m_trie.search(s1))
                    {
                        m_candidates.insert(s1);
                    }

                    if (m_trie.search(s2))
                    {
                        m_candidates.insert(s2);
                    }
                }

            }
        }
    }

    void writeResult()
    {
        std::copy(m_candidates.begin(), m_candidates.end(), ostream_iterator<string>(ofs, "\n"));
    }

    template<typename Stream>
            void operator()(Stream & os)
    {
        string key;
        vector<string> vec;
        while (!os.eof())
        {
            os >> key;
            //---规则化处理key
            key = m_converter.Normalize(key);
            if (key.empty())
                continue;
            key = filterString2(key);
            if (key.empty())
                continue;
            m_seg.maxSegment(key, vec);
            findQuechaokafei(vec);
        }
    }
};

void run()
{
    DBReader db_reader;
    db_reader.init(FLAGS_config, FLAGS_section);
    QuechaokafeiFunc quechaokafei_func;
    db_reader.process(quechaokafei_func);
    quechaokafei_func.writeResult();
}

/**
 * Program entry point: sets up glog (logging to stderr, failure signal
 * handler), parses the gflags command line, then runs the extraction.
 *
 * @return Always 0.
 */
int main(int argc, char *argv[])
{
    FLAGS_logtostderr = true;
    google::InitGoogleLogging(argv[0]);
    google::InstallFailureSignalHandler();
    // The return value (index of the first non-flag argument) is not needed;
    // `false` leaves argv unmodified.
    google::ParseCommandLineFlags(&argc, &argv, false);

    // Lives until main returns so that it spans the whole of run().
    boost::progress_timer timer;

    run();

    return 0;
}
posted @ 2011-01-10 12:02  阁子  阅读(546)  评论(0)  编辑  收藏  举报