Lucene.net实现pdf，doc，xls，ppt，htm，html等格式文件的检索

Posted on 2008-06-11 18:16 weekzero 阅读(107385) 评论(2) 收藏举报

代码如下，代码没有优化，仅实现功能
该代码复制到程序中不能直接使用，需要下载文章最后的例子，取得其中的dll后才可以

using System;
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using System.Text;
using System.IO;

using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using Lucene.Net.Analysis.Standard;

using Lucene.Net.Analysis.Cn;

using org.pdfbox.pdmodel;
using org.pdfbox.util;

using System.Text.RegularExpressions;

/// <summary>
/// Web Forms page that builds a Lucene.Net full-text index over the files in a
/// directory (txt, htm, html, pdf, doc, rtf, ppt, xls) and searches that index.
/// </summary>
/// <remarks>
/// TextBox1 receives result/status text, TextBox2 the search keyword, TextBox3 the
/// directory to index and TextBox4 error messages. The controls themselves are
/// declared in the other part of this partial class.
/// </remarks>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>Time at which the most recent indexing run started.</summary>
    public DateTime start = new DateTime();
    delegate void AsyncIndexDirectoryCaller(IndexWriter writer, FileInfo file);
    IndexSearcher searcher = null;

    // File extensions (with leading dot) that IndexDirectory passes on to IndexFile.
    private static readonly string[] IndexableExtensions =
        { ".txt", ".htm", ".html", ".pdf", ".doc", ".rtf", ".ppt", ".xls" };

    /// <summary>
    /// On the first (non-postback) request, pre-fills the directory box with the
    /// physical path of the site's "doc" folder.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
            TextBox3.Text = Server.MapPath("doc");
        }
    }

    #region Index building

    /// <summary>
    /// Rebuilds the index stored under the site's "index" folder from the directory
    /// entered in TextBox3, then reports the elapsed time in TextBox1. Any failure
    /// is reported through TextBox4.
    /// </summary>
    protected void Button2_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index");  // where the index is stored
        string sourcePath = TextBox3.Text;                // directory (or file) to index

        try
        {
            IndexWriter writer = new IndexWriter(indexStorePath, new ChineseAnalyzer(), true);
            try
            {
                start = DateTime.Now;
                IndexDirectory(writer, new FileInfo(sourcePath));
                writer.Optimize();
            }
            finally
            {
                // Always close the writer, even when indexing failed, so the index
                // directory is not left locked for the next run.
                writer.Close();
            }

            TimeSpan elapsed = DateTime.Now - start;
            TextBox1.Text = "提示：索引完成，共用时 " + elapsed.TotalSeconds + " 秒\n";
        }
        catch (Exception ex)
        {
            TextBox4.Text = ex.Message;
        }
    }

    /// <summary>
    /// Recursively walks <paramref name="file"/>: directories are descended into,
    /// and files with a supported extension are added to the index.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="file">File or directory to process.</param>
    public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        if (Directory.Exists(file.FullName))
        {
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry));  // recurse into each entry
            }
            return;
        }

        if (IndexableExtensions.Contains(file.Extension, StringComparer.OrdinalIgnoreCase))
        {
            IndexFile(file, writer);
        }
    }

    /// <summary>
    /// Extracts the text of a single file according to its extension and adds it to
    /// the index as one document with a stored "filename" field and a tokenized,
    /// unstored "contents" field.
    /// </summary>
    /// <remarks>
    /// A <see cref="FileNotFoundException"/> is reported in TextBox4 and the file is
    /// skipped; every other exception propagates to the caller.
    /// </remarks>
    private void IndexFile(FileInfo file, IndexWriter writer)
    {
        try
        {
            switch (file.Extension.ToLowerInvariant())
            {
                case ".pdf":
                    AddTextDocument(writer, file, ExtractPdfText(file.FullName));
                    break;

                case ".doc":
                case ".rtf":    // Word can open rtf files as well
                    AddTextDocument(writer, file, ExtractWordText(file.FullName));
                    break;

                case ".ppt":
                    AddTextDocument(writer, file, ExtractPowerPointText(file.FullName));
                    break;

                case ".xls":
                    AddTextDocument(writer, file, ExtractExcelText(file.FullName));
                    break;

                case ".htm":
                case ".html":
                    // Index the tag-stripped text rather than the raw markup, and read
                    // the file with the same encoding the plain-text branch uses.
                    AddTextDocument(writer, file,
                        NoHTML(File.ReadAllText(file.FullName, System.Text.Encoding.Default)));
                    break;

                default:        // everything else is treated as plain text
                    {
                        Document doc = new Document();
                        doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
                        doc.Add(new Field("contents", new StreamReader(file.FullName, System.Text.Encoding.Default)));
                        writer.AddDocument(doc);
                        break;
                    }
            }
        }
        catch (FileNotFoundException fnfe)
        {
            TextBox4.Text = TextBox4.Text + fnfe.Message + "\n";
        }
    }

    // Adds one document holding the file's full path and its extracted text.
    private static void AddTextDocument(IndexWriter writer, FileInfo file, string text)
    {
        Document doc = new Document();
        doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("contents", text ?? string.Empty, Field.Store.NO, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
    }

    // Returns the text of a PDF file; the PDF document is closed even on failure.
    private static string ExtractPdfText(string path)
    {
        PDDocument pdf = PDDocument.load(path);
        try
        {
            return new PDFTextStripper().getText(pdf);
        }
        finally
        {
            pdf.close();
        }
    }

    // Returns the whole-story text of a Word-readable file (doc, rtf) via Word
    // automation; the document is closed and Word is quit even on failure.
    private static string ExtractWordText(string path)
    {
        object filePath = path;
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Word.ApplicationClass wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
        try
        {
            Microsoft.Office.Interop.Word.Document wordDoc = wordApp.Documents.Open(
                ref filePath, ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing);
            try
            {
                wordDoc.ActiveWindow.Selection.WholeStory();
                return wordDoc.ActiveWindow.Selection.Text ?? string.Empty;
            }
            finally
            {
                wordDoc.Close(ref missing, ref missing, ref missing);
            }
        }
        finally
        {
            wordApp.Quit(ref missing, ref missing, ref missing);
        }
    }

    // Returns the text of every shape on every slide, one shape per line; the
    // presentation is closed and PowerPoint is quit even on failure.
    private static string ExtractPowerPointText(string path)
    {
        PowerPoint.ApplicationClass pptApp = new PowerPoint.ApplicationClass();
        try
        {
            PowerPoint.Presentation presentation = pptApp.Presentations.Open(path,
                Microsoft.Office.Core.MsoTriState.msoTrue,
                Microsoft.Office.Core.MsoTriState.msoFalse,
                Microsoft.Office.Core.MsoTriState.msoFalse);
            try
            {
                StringBuilder text = new StringBuilder();
                foreach (PowerPoint.Slide slide in presentation.Slides)
                {
                    foreach (PowerPoint.Shape shape in slide.Shapes)
                    {
                        try
                        {
                            string shapeText = shape.TextFrame.TextRange.Text;
                            // Separate shapes so their words are not fused into one token.
                            text.Append(shapeText).Append('\n');
                        }
                        catch (Exception)
                        {
                            // Deliberate best effort: a shape whose text cannot be
                            // read is skipped and the remaining shapes are still indexed.
                        }
                    }
                }
                return text.ToString();
            }
            finally
            {
                presentation.Close();
            }
        }
        finally
        {
            pptApp.Quit();
        }
    }

    // Returns the values of all used cells of every sheet, separated by spaces; the
    // workbook is closed and Excel is quit even on failure.
    private static string ExtractExcelText(string path)
    {
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application excelApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
        try
        {
            Microsoft.Office.Interop.Excel.Workbook workbook = excelApp.Workbooks._Open(path,
                missing, missing, missing, missing,
                missing, missing, missing, missing,
                missing, missing, missing, missing);
            try
            {
                StringBuilder text = new StringBuilder();
                int sheetCount = workbook.Sheets.Count;

                for (int sheetIndex = 1; sheetIndex <= sheetCount; sheetIndex++)   // Excel collections are 1-based
                {
                    Microsoft.Office.Interop.Excel.Worksheet sheet =
                        (Microsoft.Office.Interop.Excel.Worksheet)workbook.Sheets[sheetIndex];

                    // Address cells relative to the used range: its row/column counts
                    // only line up with sheet coordinates when the data starts at A1.
                    Microsoft.Office.Interop.Excel.Range used = sheet.UsedRange;
                    int rowCount = used.Rows.Count;
                    int columnCount = used.Columns.Count;

                    for (int row = 1; row <= rowCount; row++)
                    {
                        for (int column = 1; column <= columnCount; column++)
                        {
                            object value = ((Microsoft.Office.Interop.Excel.Range)used.Cells[row, column]).Value2;
                            if (value != null)
                            {
                                // Separate cells so adjacent values are not fused into one token.
                                text.Append(value).Append(' ');
                            }
                        }
                    }
                }
                return text.ToString();
            }
            finally
            {
                workbook.Close(missing, missing, missing);
            }
        }
        finally
        {
            excelApp.Quit();
        }
    }

    /// <summary>
    /// Strips script blocks and HTML tags from <paramref name="Htmlstring"/>, decodes
    /// a handful of common entities, drops remaining numeric entities, angle brackets
    /// and line breaks, and returns the result HTML-encoded and trimmed.
    /// </summary>
    /// <param name="Htmlstring">HTML markup; null or empty yields an empty string.</param>
    /// <returns>The HTML-encoded plain text.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script blocks; Singleline lets '.' span the line breaks inside them.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        // Remove HTML tags and comment remnants.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
        // Decode common entities, then drop any other numeric entity.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00a1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00a2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00a3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00a9", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        // string.Replace returns a new string, so the result has to be assigned back.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // HttpUtility does the same encoding as Server.HtmlEncode without needing a
        // current HttpContext.
        return HttpUtility.HtmlEncode(Htmlstring).Trim();
    }
    #endregion

    #region Search

    /// <summary>
    /// Searches the "contents" field of the index under the site's "index" folder for
    /// the keyword in TextBox2 and prints the matching file paths. Any failure is
    /// appended to TextBox4.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath("index");  // where the index is stored
        string keyword = TextBox2.Text;

        try
        {
            searcher = new IndexSearcher(indexStorePath);
            try
            {
                QueryParser parser = new QueryParser("contents", new ChineseAnalyzer());
                Query query = parser.Parse(keyword);
                Hits hits = searcher.Search(query);

                // Hits are read through the searcher, so print before closing it.
                printResult(hits);
            }
            finally
            {
                // Release the index files even when parsing or searching failed.
                searcher.Close();
                searcher = null;
            }
        }
        catch (Exception ex)
        {
            TextBox4.Text = TextBox4.Text + ex.Message;
        }
    }

    /// <summary>
    /// Writes one line per hit (rank and stored file path) to TextBox1, or a
    /// "nothing found" message when there are no hits. A hit that cannot be loaded
    /// is reported in TextBox4 and skipped.
    /// </summary>
    void printResult(Hits h)
    {
        StringBuilder output = new StringBuilder();
        int hitCount = h.Length();

        if (hitCount == 0)
        {
            output.Append("对不起，没有搜索到你要的结果。\n");
        }
        else
        {
            for (int i = 0; i < hitCount; i++)
            {
                try
                {
                    Document doc = h.Doc(i);
                    output.Append("这是第").Append(i + 1).Append("个搜索结果,文件路径为： ").Append(doc.Get("filename")).Append("\n");
                }
                catch (Exception ex)
                {
                    TextBox4.Text = TextBox4.Text + ex.Message;
                }
            }
        }

        output.Append("---------------------------\n");
        TextBox1.Text = output.ToString();
    }

    #endregion

}

完整demo下载，点击下载

刷新页面返回顶部

星期零

公告

Lucene.net实现pdf，doc，xls，ppt，htm，html等格式文件的检索