一个小应用,贴一下代码,主要是为了推荐我非常喜欢的 glog 和 gflags :) 感谢 Google,它们让我的生活更轻松 :)
另外,用 OTL 读取数据库相当方便。我基于 OTL 封装了一个 DBReader,这样处理数据库和处理文本使用完全相同的接口,完全屏蔽了数据库操作的细节,方便了很多。
/**
* ==============================================================================
*
* \file quechaokafei.cc
* \author goldenlock
* \Description: 由于标题中存在 “雀巢 咖啡” 这种写法, 造成用户搜 “雀巢咖啡” 时搜不到产品
* 解决办法: 读取数据库中所有单品, 对于标题如果发现
* “雀巢*咖啡” 或 “咖啡*雀巢”, 则把这个词加入到
* blacklist 中 (当前为 dict/quechaokafei.txt)
*
* ==============================================================================
*/
#define private public
#define protected public
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include "utils/db_reader.h"
#include <algorithm>
#include <boost/progress.hpp>
#include <glog/logging.h>
#include <gflags/gflags.h>
#include <tr1/unordered_set>
#include "gbk_ch_converter.h"
#include "string_help.h"
#include "config_help.h"
#include "debug_help.h"
#include "include/segmentor.h"
#include "include/gbk_datrie.h"
using namespace std;
// Command-line flags (gflags). Each one is read through its FLAGS_<name> variable.
// --type: not referenced anywhere in the visible code.
DEFINE_string(type, "simple", "");
// --config: configuration file handed to DBReader::init() in run().
DEFINE_string(config, "read_db.ini", "数据库配置文件,读取其中的title数据");
// --section: section name handed to DBReader::init() together with --config.
DEFINE_string(section, "all_title", "读取其中的title数据");
// --o: path of the output file opened by QuechaokafeiFunc; receives the collected blacklist words.
DEFINE_string(o, "dict/quechaokafei.txt", "提取的黑名单词");
// --min_len: minimum word length; findQuechaokafei() compares word byte sizes against min_len * 2.
DEFINE_int32(min_len, 2, "单个词要求都>=2 当前");
// --prob_dir: value passed to the segment::ProbSegmentor constructor.
DEFINE_string(prob_dir, "testNgramTitle", "概率分词器的souce dir");
struct QuechaokafeiFunc
{
typedef std::tr1::unordered_set<string> HashSet;
HashSet m_candidates;
ch_convert::ChConverter m_converter;
segment::ProbSegmentor m_seg;
segment::GBK_DATrie_ m_trie;
ofstream ofs;
QuechaokafeiFunc()
: m_seg(FLAGS_prob_dir), m_trie(m_seg.m_seg.m_trie, m_seg.m_seg.m_encoder), ofs(FLAGS_o.c_str())
{
}
void findQuechaokafei(const vector<string>& vec)
{
for (size_t i = 0; i < vec.size() - 1; i++)
{
if (vec[i].size() < FLAGS_min_len * 2 || !m_trie.search(vec[i]))
continue;
for (size_t j = i + 1; j < vec.size(); j++)
{
if (vec[j].size() < FLAGS_min_len * 2)
continue;
if (m_trie.search(vec[j]))
{
string s1 = vec[i] + vec[j];
string s2 = vec[j] + vec[i];
if (m_trie.search(s1))
{
m_candidates.insert(s1);
}
if (m_trie.search(s2))
{
m_candidates.insert(s2);
}
}
}
}
}
void writeResult()
{
std::copy(m_candidates.begin(), m_candidates.end(), ostream_iterator<string>(ofs, "\n"));
}
template<typename Stream>
void operator()(Stream & os)
{
string key;
vector<string> vec;
while (!os.eof())
{
os >> key;
//---规则化处理key
key = m_converter.Normalize(key);
if (key.empty())
continue;
key = filterString2(key);
if (key.empty())
continue;
m_seg.maxSegment(key, vec);
findQuechaokafei(vec);
}
}
};
void run()
{
DBReader db_reader;
db_reader.init(FLAGS_config, FLAGS_section);
QuechaokafeiFunc quechaokafei_func;
db_reader.process(quechaokafei_func);
quechaokafei_func.writeResult();
}
/**
 * Program entry point: sets up glog (logging to stderr by default, failure
 * signal handler installed), parses the gflags command line, and runs the
 * extraction while a boost::progress_timer is alive.
 *
 * @return Always 0.
 */
int main(int argc, char *argv[])
{
    // Default to stderr logging; assigned before flag parsing, so a
    // --logtostderr given on the command line can still override it.
    FLAGS_logtostderr = true;
    google::InitGoogleLogging(argv[0]);
    google::InstallFailureSignalHandler();
    // The return value (index of the first non-flag argument) is not needed
    // here, so it is no longer stored in an unused local.
    google::ParseCommandLineFlags(&argc, &argv, false);
    // Lives until main returns; boost::progress_timer reports the elapsed
    // time when it is destroyed.
    boost::progress_timer timer;
    run();
    return 0;
}