星期零

技术改变生活,分享让我们快乐!
随笔 - 159, 文章 - 0, 评论 - 234, 阅读 - 44万
  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

Lucene.Net 实现 pdf、doc、xls、ppt、htm、html 等格式文件的检索

Posted on   weekzero  阅读(107366)  评论(2)  编辑  收藏  举报

代码如下。代码没有经过优化,仅实现了功能。
该代码直接复制到程序中不能使用,需要先下载文章最后的例子,取得其中的 dll 后才可以运行。

using System;
using System.Configuration;
using System.Data;
using System.Linq;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Xml.Linq;
using System.Text;
using System.IO;

using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using Lucene.Net.Analysis.Standard;

using Lucene.Net.Analysis.Cn;


using org.pdfbox.pdmodel;
using org.pdfbox.util;

using System.Text.RegularExpressions;

public partial class _Default : System.Web.UI.Page
{
    
// Time at which the most recent index build began; Button2_Click assigns it and
// subtracts it from DateTime.Now to report the elapsed time.
public DateTime start = new DateTime();
    
// Delegate with the same parameter list as IndexDirectory(writer, file).
// NOTE(review): not referenced in the visible part of this partial class — confirm whether it is still needed.
delegate void AsyncIndexDirectoryCaller(IndexWriter writer, FileInfo file);
    // Index searcher used by Button1_Click; assigned there when a search starts.
    IndexSearcher searcher 
= null;

    
/// <summary>
/// On the first (non-postback) request, pre-fills TextBox3 with the physical
/// path that the relative folder "doc" maps to on the server.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
    {
        // Postbacks keep whatever path is already in the text box.
        if (IsPostBack)
        {
            return;
        }

        string defaultDocumentFolder = Server.MapPath("doc");
        TextBox3.Text = defaultDocumentFolder;
    }


    
#region 建立索引
    
/// <summary>
/// Rebuilds the full-text index: crawls the directory entered in TextBox3,
/// writes the index to the "index" folder, and reports the elapsed time in
/// TextBox1. Any failure message is shown in TextBox4.
/// </summary>
protected void Button2_Click(object sender, EventArgs e)
    {
        string INDEX_STORE_PATH = Server.MapPath("index");  // directory where the index is stored
        // NOTE(review): the directory to crawl comes straight from a text box, so whoever
        // can reach this page can index any folder the worker process can read.
        string INDEX_PATH = TextBox3.Text;                  // directory whose files are indexed

        IndexWriter writer = null;
        try
        {
            // The third argument asks Lucene to create the index from scratch.
            writer = new IndexWriter(INDEX_STORE_PATH, new ChineseAnalyzer(), true);
            start = DateTime.Now;

            IndexDirectory(writer, new FileInfo(INDEX_PATH));
            writer.Optimize();
            writer.Close();
            writer = null;  // closed successfully; nothing left for the finally block to do

            TimeSpan s = DateTime.Now - start;
            TextBox1.Text = "提示:索引完成,共用时 " + s.TotalSeconds + " 秒\n";
        }
        catch (Exception ex)
        {
            TextBox4.Text = ex.Message;
        }
        finally
        {
            // If indexing failed part-way the writer is still open; close it so the
            // index directory is not left locked for the next run.
            if (writer != null)
            {
                try
                {
                    writer.Close();
                }
                catch (Exception closeEx)
                {
                    TextBox4.Text = TextBox4.Text + "\n" + closeEx.Message;
                }
            }
        }
    }

    
/// <summary>
/// Recursively walks <paramref name="file"/>. A directory has every entry it
/// contains visited in turn; a file is handed to IndexFile when its extension
/// is one of .txt, .htm, .html, .pdf, .doc, .rtf, .ppt or .xls. Anything else
/// is ignored.
/// </summary>
/// <param name="writer">Index writer that receives the documents.</param>
/// <param name="file">File or directory to process.</param>
public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        if (!Directory.Exists(file.FullName))
        {
            // Not a directory: index it only when the extension is supported.
            switch (file.Extension.ToLower())
            {
                case ".txt":
                case ".htm":
                case ".html":
                case ".pdf":
                case ".doc":
                case ".rtf":
                case ".ppt":
                case ".xls":
                    IndexFile(file, writer);
                    break;
            }
            return;
        }

        // Directory: recurse into every file and sub-directory it contains.
        foreach (string entryPath in Directory.GetFileSystemEntries(file.FullName))
        {
            IndexDirectory(writer, new FileInfo(entryPath));
        }
    }

    
/// <summary>
/// Extracts the text of one file and adds it to the index as a document with
/// two fields: "filename" (stored, not tokenized) and "contents" (tokenized,
/// not stored). The extraction method is chosen from the file extension;
/// unrecognised extensions are indexed as plain text.
/// </summary>
/// <param name="file">File to index.</param>
/// <param name="writer">Index writer that receives the document.</param>
/// <remarks>
/// A FileNotFoundException is reported in TextBox4 and the file is skipped;
/// any other exception propagates to the caller.
/// </remarks>
private void IndexFile(FileInfo file, IndexWriter writer)
    {
        try
        {
            string text;
            switch (file.Extension.ToLowerInvariant())
            {
                case ".pdf":
                    text = ExtractPdfText(file.FullName);
                    break;
                case ".doc":
                case ".rtf":    // Word opens .rtf files too, so both share one code path
                    text = ExtractWordText(file.FullName);
                    break;
                case ".ppt":
                    text = ExtractPowerPointText(file.FullName);
                    break;
                case ".xls":
                    text = ExtractExcelText(file.FullName);
                    break;
                case ".htm":
                case ".html":
                    // Index the tag-stripped text rather than the raw markup. The file is
                    // decoded with Encoding.Default, the same encoding used for plain text.
                    text = NoHTML(File.ReadAllText(file.FullName, System.Text.Encoding.Default));
                    break;
                default:        // everything else is treated as a plain text file
                    AddStreamedDocument(file, writer);
                    return;
            }

            Document doc = new Document();
            doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
            doc.Add(new Field("contents", text, Field.Store.NO, Field.Index.TOKENIZED));
            writer.AddDocument(doc);
        }
        catch (FileNotFoundException fnfe)
        {
            TextBox4.Text = TextBox4.Text + fnfe.Message + "\n";
            return;
        }
    }

    /// <summary>Indexes a plain text file by streaming its contents into the "contents" field.</summary>
    private static void AddStreamedDocument(FileInfo file, IndexWriter writer)
    {
        Document doc = new Document();
        doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));

        // The reader has to stay open until AddDocument has consumed it, so the
        // using block encloses the AddDocument call.
        using (StreamReader reader = new StreamReader(file.FullName, System.Text.Encoding.Default))
        {
            doc.Add(new Field("contents", reader));
            writer.AddDocument(doc);
        }
    }

    /// <summary>Returns the text of a PDF file, closing the PDF document even when extraction fails.</summary>
    private static string ExtractPdfText(string path)
    {
        PDDocument pddoc = PDDocument.load(path);
        try
        {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(pddoc);
        }
        finally
        {
            pddoc.close();
        }
    }

    /// <summary>Returns the text of a .doc or .rtf file by opening it in Word and selecting the whole story.</summary>
    private static string ExtractWordText(string path)
    {
        object filePath = path;
        object nullobj = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Word.ApplicationClass wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
        try
        {
            Microsoft.Office.Interop.Word.Document docdoc = wordApp.Documents.Open(
                ref filePath, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                ref nullobj, ref nullobj, ref nullobj, ref nullobj);
            try
            {
                docdoc.ActiveWindow.Selection.WholeStory();
                return docdoc.ActiveWindow.Selection.Text.ToString();
            }
            finally
            {
                docdoc.Close(ref nullobj, ref nullobj, ref nullobj);
            }
        }
        finally
        {
            // Always quit Word, otherwise a failed file leaves the process it started running.
            wordApp.Quit(ref nullobj, ref nullobj, ref nullobj);
        }
    }

    /// <summary>Returns the text of every shape on every slide of a .ppt file, one shape per line.</summary>
    private static string ExtractPowerPointText(string path)
    {
        StringBuilder text = new StringBuilder();

        PowerPoint.ApplicationClass pptApp = new PowerPoint.ApplicationClass();
        try
        {
            PowerPoint.Presentation pptPre = pptApp.Presentations.Open(path,
                Microsoft.Office.Core.MsoTriState.msoTrue,
                Microsoft.Office.Core.MsoTriState.msoFalse,
                Microsoft.Office.Core.MsoTriState.msoFalse);
            try
            {
                foreach (PowerPoint.Slide slide in pptPre.Slides)
                {
                    foreach (PowerPoint.Shape shape in slide.Shapes)
                    {
                        try
                        {
                            string shapeText = shape.TextFrame.TextRange.Text;
                            // The separator keeps the last word of one shape from fusing
                            // with the first word of the next.
                            text.Append(shapeText).Append('\n');
                        }
                        catch (Exception)
                        {
                            // Deliberate best effort: a shape whose text cannot be read
                            // is skipped instead of failing the whole presentation.
                        }
                    }
                }
            }
            finally
            {
                pptPre.Close();
            }
        }
        finally
        {
            pptApp.Quit();
        }

        return text.ToString();
    }

    /// <summary>Returns the values of all used cells of every worksheet in an .xls file, separated by spaces.</summary>
    private static string ExtractExcelText(string path)
    {
        StringBuilder text = new StringBuilder();
        object nullobj = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
        try
        {
            Microsoft.Office.Interop.Excel.Workbook xBook = xApp.Workbooks._Open(path,
                nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj, nullobj);
            try
            {
                // Excel collections are 1-based.
                for (int i = 1; i <= xBook.Sheets.Count; i++)
                {
                    Microsoft.Office.Interop.Excel.Worksheet xSheet =
                        xBook.Sheets[i] as Microsoft.Office.Interop.Excel.Worksheet;
                    if (xSheet == null)
                    {
                        continue;   // not a worksheet (e.g. a chart sheet): no cells to read
                    }

                    Microsoft.Office.Interop.Excel.Range used = xSheet.UsedRange;
                    int rcount = used.Rows.Count;
                    int ccount = used.Columns.Count;

                    for (int m = 1; m <= rcount; m++)
                    {
                        for (int n = 1; n <= ccount; n++)
                        {
                            // Index relative to the used range: it does not necessarily start
                            // at A1, so sheet-absolute indexing would miss cells.
                            object value = ((Microsoft.Office.Interop.Excel.Range)used.Cells[m, n]).Value2;
                            if (value != null)
                            {
                                text.Append(value).Append(' ');
                            }
                        }
                    }
                }
            }
            finally
            {
                // The workbook is only read; close without saving so Excel never
                // waits on a "save changes?" prompt.
                xBook.Close(false, nullobj, nullobj);
            }
        }
        finally
        {
            xApp.Quit();
        }

        return text.ToString();
    }

    
/// <summary>
/// Strips HTML markup from a string: removes script blocks, tags and comment
/// remnants, decodes a handful of common character entities, drops any
/// remaining angle brackets and CR/LF pairs, then HTML-encodes and trims the
/// result.
/// </summary>
/// <param name="Htmlstring">HTML source text; null is treated as empty.</param>
/// <returns>The filtered text, or an empty string for null or empty input.</returns>
public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script blocks. Singleline lets ".*?" cross line breaks so that
        // multi-line scripts are removed together with their code.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove HTML tags, line breaks followed by whitespace, and comment remnants.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode common character entities.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

        // Any other numeric entity is dropped.
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        // Strings are immutable: the result of Replace has to be assigned back,
        // otherwise these three calls do nothing.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // The static HttpUtility helper does not need an active HttpContext,
        // so the method also works outside a web request.
        Htmlstring = HttpUtility.HtmlEncode(Htmlstring).Trim();

        return Htmlstring;
    }
    
#endregion

    
#region 搜索
    
/// <summary>
/// Runs the query entered in TextBox2 against the "contents" field of the
/// index stored in the "index" folder and displays the hits via printResult.
/// Any failure message is appended to TextBox4.
/// </summary>
protected void Button1_Click(object sender, EventArgs e)
    {
        string INDEX_STORE_PATH = Server.MapPath("index");  // directory where the index is stored
        string KEYWORD = TextBox2.Text;

        try
        {
            searcher = new IndexSearcher(INDEX_STORE_PATH);

            QueryParser q = new QueryParser("contents", new ChineseAnalyzer());
            Query query = q.Parse(KEYWORD);

            Hits hits = searcher.Search(query);

            // The hits are read while the searcher is still open.
            printResult(hits);
        }
        catch (Exception ex)
        {
            TextBox4.Text = TextBox4.Text + ex.Message;
        }
        finally
        {
            // Close the searcher even when parsing or searching failed, so the
            // index files it opened are released.
            if (searcher != null)
            {
                try
                {
                    searcher.Close();
                }
                catch (Exception closeEx)
                {
                    TextBox4.Text = TextBox4.Text + closeEx.Message;
                }
                searcher = null;
            }
        }
    }

    
/// <summary>
/// Writes the search results to TextBox1: one line per hit giving its
/// 1-based position and the stored "filename" field, or a "nothing found"
/// message when there are no hits, followed by a separator line.
/// A hit that cannot be read has its error message appended to TextBox4.
/// </summary>
/// <param name="h">Hits returned by the searcher.</param>
void printResult(Hits h)
    {
        StringBuilder str = new StringBuilder();

        if (h.Length() == 0)
        {
            str.Append("对不起,没有搜索到你要的结果。\n");
        }
        else
        {
            for (int i = 0; i < h.Length(); i++)
            {
                try
                {
                    // Fetch the path first so a failing hit adds nothing to the output.
                    Document doc = h.Doc(i);
                    string path = doc.Get("filename");

                    str.Append("这是第").Append(i + 1).Append("个搜索结果,文件路径为: ").Append(path).Append("\n");
                }
                catch (Exception ex)
                {
                    TextBox4.Text = TextBox4.Text + ex.Message;
                }
            }
        }

        str.Append("---------------------------\n");
        TextBox1.Text = str.ToString();
    }

    
#endregion

}


完整demo下载,点击下载

编辑推荐:
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 单线程的Redis速度为什么快?
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 展开说说关于C#中ORM框架的用法!
点击右上角即可分享
微信分享提示