蛙蛙推荐:蛙蛙教你索引邮件
蛙蛙推荐:蛙蛙教你索引邮件
困了,不多说了,就是教你怎样把邮件建立索引,再搜索出来。用MAPI把邮件读取到数据库里,用SharpICTCLAS做一个lucene的中文的语汇单元分析器,用lucene建立索引及查询索引。
把某目录邮件读取到数据库里的代码很简单
{
    // Start Outlook through COM automation and open a MAPI session.
    _Application appOutlook = new Application();
    NameSpace outlookNS = appOutlook.GetNamespace("MAPI");
    outlookNS.Logon("", null, null, null);
    try
    {
        Console.WriteLine(outlookNS.Folders.Count);
        // Import the "chat" sub-folder of the first top-level folder.
        MAPIFolder inboxFolder = outlookNS.Folders[1].Folders["chat"];
        HandlerFolder(inboxFolder);
    }
    finally
    {
        // Always end the session, even when the folder lookup or the import throws.
        outlookNS.Logoff();
    }
}
/// <summary>
/// Walks every item of the given folder and stores each mail item through
/// <c>MailDAO.Add_Mail_SSH</c>.
/// </summary>
/// <param name="inboxFolder">Folder whose items are imported.</param>
private static void HandlerFolder(MAPIFolder inboxFolder)
{
    foreach (object entry in inboxFolder.Items)
    {
        MailItem message = entry as MailItem;
        if (message == null)
        {
            // Items that are not a MailItem are skipped.
            continue;
        }

        Console.WriteLine("ReceivedTime:{0}\r\nSubject:{1}", message.ReceivedTime, message.Subject);
        try
        {
            // CC and BCC are replaced by an empty string when they are null.
            MailDAO.Add_Mail_SSH(
                message.Subject,
                message.Body,
                message.ReceivedTime,
                message.To,
                message.SenderName,
                message.SenderEmailAddress,
                message.CC ?? "",
                message.BCC ?? "");
        }
        catch (Exception error)
        {
            // A failure on one mail is reported and the loop continues with the next item.
            Console.WriteLine(error);
        }
    }
}
为了方便测试,先弄一些假数据
{
/// <summary>
/// Returns four hard-coded sample mails that are used to test indexing and searching.
/// </summary>
/// <returns>The sample mails, in the order they are created.</returns>
public static IList<Email> GetMails()
{
    List<Email> samples = new List<Email>();
    samples.Add(CreateSampleMail(
        "倡议:大家做一个.net开源的灾难管理系统",
        "onlytiancai@yahoo.com.cn;onlytiancai@126.com",
        "2008-05-18 12:19"));
    samples.Add(CreateSampleMail(
        "[置顶]蛙蛙推荐:蛙蛙教你文本聚类",
        "onlytiancai@yahoo.com.cn;onlytiancai@126.com",
        "2008-05-10 20:43"));
    samples.Add(CreateSampleMail(
        "蛙蛙推荐:蛙蛙牌关键词提取算法",
        "onlytiancai@yahoo.com.cn;onlytiancai@126.com",
        "2008-05-11 23:34"));
    samples.Add(CreateSampleMail(
        "蛙蛙推荐:蛙蛙牌软件注册码算法",
        "onlytiancai@yahoo.com.cn;onlytiancai@fetionmm.com",
        "2008-05-03 21:55"));
    return samples;
}

/// <summary>
/// Builds one sample mail. The body is the same text as the subject; recipients, blind copies
/// and sender are identical for every sample.
/// </summary>
/// <param name="subject">Text used for both the subject and the body.</param>
/// <param name="cc">Value of the Cc property.</param>
/// <param name="receiveTime">Receive time, parsed with <c>DateTime.Parse</c>.</param>
private static Email CreateSampleMail(string subject, string cc, string receiveTime)
{
    Email sample = new Email();
    sample.Subject = subject;
    sample.Body = subject;
    sample.MailTo = "onlytiancai@sohu.com;onlytiancai@163.com;onlytiancai@msn.com;";
    sample.Cc = cc;
    sample.Bcc = "onlytiancai@qq.com;onlytiancai@sina.com;onlytiancai@gmail.com";
    sample.SenderEmailAddress = "onlytiancai@live.com";
    sample.SenderName = "蛙蛙王子";
    sample.ReceiveTime = DateTime.Parse(receiveTime);
    return sample;
}
}
像收件人、抄送地址等域,用一个简单的语汇单元分析器就可以,代码如下。因为token的起止偏移量(offset)只在分析的时候有用,只有位置增量才会写到索引里,所以起止偏移量都写了-1
/// <summary>
/// Tokenizer for strings whose items are separated by simple delimiter characters,
/// such as a semicolon-separated list.
/// </summary>
/// <remarks>
/// The whole input is read in the constructor, lower-cased, stripped of single quotes and
/// has its double quotes turned into spaces. It is split lazily on the first call to
/// <see cref="Next"/>.
/// </remarks>
public class WawaSimpleTokenizer : Tokenizer
{
    /// <summary>Characters that end a token; they never appear inside an emitted token.</summary>
    private static readonly char[] Separators =
        new char[] { ' ', '\t', '{', '}', '(', ')', ':', ';', '.', '\n' };

    private readonly string _txt;
    private List<string> _filter;
    private int _current;

    /// <summary>
    /// Reads the complete text from <paramref name="reader"/> and normalizes it.
    /// </summary>
    /// <param name="reader">Source of the text to tokenize.</param>
    public WawaSimpleTokenizer(TextReader reader)
    {
        input = reader;
        // ToLowerInvariant keeps the produced terms independent of the current culture,
        // so text indexed on one machine matches queries analyzed on another.
        _txt = input.ReadToEnd().ToLowerInvariant().Replace("'", "").Replace('"', ' ');
    }

    /// <summary>
    /// Returns the next token, or <c>null</c> when all tokens have been returned.
    /// </summary>
    /// <returns>
    /// A token whose start and end offsets are both -1, or <c>null</c> at the end of the input.
    /// </returns>
    public override Token Next()
    {
        if (_filter == null)
        {
            // First call: split the text once and keep the pieces for the following calls.
            _filter = SplitTokens(_txt);
            _current = 0;
        }

        if (_current >= _filter.Count)
        {
            return null;
        }

        string word = _filter[_current];
        _current++;
        return new Token(word, -1, -1);
    }

    /// <summary>
    /// Splits the text on the separator characters and drops pieces that are empty or
    /// consist only of white space. Kept pieces are returned untrimmed.
    /// </summary>
    private static List<string> SplitTokens(string text)
    {
        string[] pieces = text.Split(Separators, StringSplitOptions.RemoveEmptyEntries);
        List<string> kept = new List<string>(pieces.Length);
        foreach (string piece in pieces)
        {
            if (piece.Trim().Length > 0)
            {
                kept.Add(piece);
            }
        }
        return kept;
    }
}
把SharpICTCLAS适配成lucene的分析器,已经有人做了,直接拿过来用,链接如下
http://www.cnblogs.com/birdshover/archive/2008/03/26/1122305.html
建立索引的代码如下,写了些注释,就不多说了
{
// Path of the directory that holds the index files; assigned in setUp().
protected string _indexDirectory;
// Writer used to add documents to the index; created in setUp().
protected IndexWriter _writer = null;
/// <summary>
/// Builds the per-field analyzer and creates the <c>IndexWriter</c> for the "mailindex"
/// directory below the current directory.
/// </summary>
protected virtual void setUp()
{
    // Dictionary directory used by the Chinese analyzer.
    string dictionaryDir = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
    SharpICTCLASAnalyzer chineseAnalyzer = new SharpICTCLASAnalyzer(dictionaryDir);

    // Per-field wrapper: the Chinese analyzer is the default (mail subject, body, ...).
    PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(chineseAnalyzer);

    // Recipient, Cc and Bcc addresses are analyzed by the delimiter-based analyzer.
    WawaSimpleAnalyzer addressAnalyzer = new WawaSimpleAnalyzer();
    foreach (string addressField in new string[] { "MailTo", "Cc", "Bcc" })
    {
        perFieldAnalyzer.AddAnalyzer(addressField, addressAnalyzer);
    }

    // Index directory.
    _indexDirectory = Path.Combine(Environment.CurrentDirectory, "mailindex") + Path.DirectorySeparatorChar;

    _writer = new IndexWriter(_indexDirectory, perFieldAnalyzer, true);
    _writer.SetUseCompoundFile(true);        // explicitly use the compound index format
    _writer.SetMaxFieldLength(int.MaxValue); // maximum field length set to the largest value
    _writer.SetMergeFactor(100);             // merge every 100 segments into one larger segment
    _writer.SetMaxMergeDocs(10000);          // maximum number of documents in one segment
    _writer.SetMaxBufferedDocs(1000);        // documents buffered in memory before writing to disk
}
/// <summary>
/// Creates the index writer, indexes every mail returned by <c>MailDAO.GetMails()</c>
/// and finally closes the writer.
/// </summary>
/// <remarks>
/// A failure while indexing one mail is written to the console and the loop continues with
/// the next mail; any other failure is written with the "Run:" prefix. <c>close()</c> is
/// called in every case.
/// </remarks>
public void ExeCute()
{
    try
    {
        setUp();
        IList<Email> mails = MailDAO.GetMails();
        foreach (Email mail in mails)
        {
            try
            {
                Console.WriteLine("正在索引:{0}", mail.Subject);
                _writer.AddDocument(BuildDocument(mail));
            }
            catch (Exception ex)
            {
                Console.WriteLine("索引出错:{0},{1}", mail.Subject, ex);
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Run:{0}", ex);
    }
    finally
    {
        close();
    }
}

/// <summary>
/// Converts one mail into a Lucene document. Null string properties are indexed as "".
/// </summary>
/// <param name="mail">Mail to convert.</param>
/// <returns>The document holding the mail's fields.</returns>
private static Document BuildDocument(Email mail)
{
    Document doc = new Document();

    // Subject: tokenized, indexed, stored.
    doc.Add(new Field("Subject", mail.Subject ?? "",
        Field.Store.YES, Field.Index.TOKENIZED));

    // Body: tokenized, indexed, not stored.
    doc.Add(new Field("Body", mail.Body ?? "",
        Field.Store.NO, Field.Index.TOKENIZED));

    // Receive time as yyyyMMdd: indexed as a single untokenized term, stored.
    doc.Add(new Field("ReceiveTime", mail.ReceiveTime.ToString("yyyyMMdd"),
        Field.Store.YES, Field.Index.UN_TOKENIZED));

    // Recipients: tokenized, indexed, stored.
    doc.Add(new Field("MailTo", mail.MailTo ?? "",
        Field.Store.YES, Field.Index.TOKENIZED));

    // Sender name: indexed as a single untokenized term, stored.
    doc.Add(new Field("SenderName", mail.SenderName ?? "",
        Field.Store.YES, Field.Index.UN_TOKENIZED));

    // Sender address: indexed as a single untokenized term, stored.
    doc.Add(new Field("SenderEmailAddress", mail.SenderEmailAddress ?? "",
        Field.Store.YES, Field.Index.UN_TOKENIZED));

    // Cc and Bcc: tokenized, indexed, stored. They must be indexed (not Field.Index.NO),
    // otherwise a query against the "Cc" or "Bcc" field can never match.
    doc.Add(new Field("Cc", mail.Cc ?? "",
        Field.Store.YES, Field.Index.TOKENIZED));
    doc.Add(new Field("Bcc", mail.Bcc ?? "",
        Field.Store.YES, Field.Index.TOKENIZED));

    return doc;
}
/// <summary>
/// Optimizes the index and closes the writer. Errors are written to the console with the
/// "Close:" prefix instead of being propagated.
/// </summary>
private void close()
{
    if (_writer == null)
    {
        // The writer was never created (for example setUp() failed), so there is
        // nothing to optimize or close.
        return;
    }

    try
    {
        try
        {
            _writer.Optimize();
        }
        finally
        {
            // Close the writer even when Optimize() throws.
            _writer.Close();
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Close:{0}", ex);
    }
}
}
搜索的代码如下
{
// Path of the directory that holds the index files; assigned in the constructor.
protected string _indexDirectory;
// Searcher opened on _indexDirectory in the constructor.
protected IndexSearcher _searcher = null;
// Parser that turns a query string into a query over several fields; built in the constructor.
private MultiFieldQueryParser _mfqp;
/// <summary>
/// Opens a searcher on the "mailindex" directory below the current directory and prepares a
/// query parser over the Subject, Body, MailTo, Cc and Bcc fields.
/// </summary>
public MailSearcher()
{
    // Index directory.
    _indexDirectory = Path.Combine(Environment.CurrentDirectory, "mailindex") + Path.DirectorySeparatorChar;
    _searcher = new IndexSearcher(_indexDirectory);

    // Dictionary directory used by the Chinese analyzer.
    string dictionaryDir = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
    SharpICTCLASAnalyzer chineseAnalyzer = new SharpICTCLASAnalyzer(dictionaryDir);
    PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(chineseAnalyzer);

    // Recipient, Cc and Bcc addresses are analyzed by the delimiter-based analyzer.
    WawaSimpleAnalyzer addressAnalyzer = new WawaSimpleAnalyzer();
    foreach (string addressField in new string[] { "MailTo", "Cc", "Bcc" })
    {
        perFieldAnalyzer.AddAnalyzer(addressField, addressAnalyzer);
    }

    _mfqp = new MultiFieldQueryParser(
        new string[] { "Subject", "Body", "MailTo", "Cc", "Bcc" },
        perFieldAnalyzer);
}
/// <summary>
/// Parses the query string with the multi-field parser and runs it against the index.
/// </summary>
/// <param name="queryStr">Query text in the parser's syntax.</param>
/// <returns>The hits returned by the searcher.</returns>
public Hits Search(string queryStr)
{
    return _searcher.Search(_mfqp.Parse(queryStr));
}
/// <summary>
/// Writes the number of hits to the console, followed by one
/// "ReceiveTime-Subject" line per hit.
/// </summary>
/// <param name="hits">Search result to print.</param>
public static void ShowHits(Hits hits)
{
    Console.WriteLine("共有{0}个结果", hits.Length());

    int position = 0;
    while (position < hits.Length())
    {
        string title = hits.Doc(position).Get("Subject");
        string received = hits.Doc(position).Get("ReceiveTime");
        Console.WriteLine("{0}-{1}", received, title);
        position++;
    }
}
}
最后整个的测试代码如下
{
    // Step 1: build the index.
    Console.WriteLine("索引");
    MailIndexer mailIndexer = new MailIndexer();
    mailIndexer.ExeCute();

    // Step 2: run the sample queries; each entry is { description, query string }.
    Console.WriteLine("搜索");
    MailSearcher mailSearcher = new MailSearcher();
    string[][] sampleQueries = new string[][]
    {
        new string[] { "搜索包含'蛙蛙'的邮件", "蛙蛙" },
        new string[] { "搜索包含'蛙蛙'且包含'聚类'的邮件", "蛙蛙 +聚类" },
        new string[] { "搜索包含接受时间从20080510到20080513的邮件", "ReceiveTime:[20080510 TO 20080513]" }
    };
    foreach (string[] sample in sampleQueries)
    {
        Console.WriteLine(sample[0]);
        MailSearcher.ShowHits(mailSearcher.Search(sample[1]));
    }

    Console.WriteLine("ok");
    // Wait for input so the console output stays visible.
    Console.Read();
}
最后的运行结果应该如下
索引
正在索引:倡议:大家做一个.net开源的灾难管理系统
正在索引:[置顶]蛙蛙推荐:蛙蛙教你文本聚类
正在索引:蛙蛙推荐:蛙蛙牌关键词提取算法
正在索引:蛙蛙推荐:蛙蛙牌软件注册码算法
搜索
搜索包含'蛙蛙'的邮件
共有3个结果
20080511-蛙蛙推荐:蛙蛙牌关键词提取算法
20080510-[置顶]蛙蛙推荐:蛙蛙教你文本聚类
20080503-蛙蛙推荐:蛙蛙牌软件注册码算法
搜索包含'蛙蛙'且包含'聚类'的邮件
共有1个结果
20080510-[置顶]蛙蛙推荐:蛙蛙教你文本聚类
搜索包含接受时间从20080510到20080513的邮件
共有2个结果
20080510-[置顶]蛙蛙推荐:蛙蛙教你文本聚类
20080511-蛙蛙推荐:蛙蛙牌关键词提取算法
ok
完整源码下载如下
https://files.cnblogs.com/onlytiancai/MailIndexer.zip
其中词库,吕震宇的中文分词的程序集及lucene.net2.0的程序集请到网上搜索下载。