一、面向对象程序设计

分为两部分:一是搜索对象的设计与限制,二是服务层次设计的引用。

package news;

/** * <p>Title: 新闻搜索引擎</p>
* <p>Copyright: Copyright (c) 2003</p> * <p>Company: </p> * @author 计算机99630 沈晨 * @version 1.0 * @Download:http://www.codefans.net */

import java.io.IOException;

import org.apache.lucene.analysis.cn.ChineseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter;

public class Index {

  IndexWriter _writer = null;   Index() throws Exception {     _writer = new IndexWriter("c:\\News\\index",                               new ChineseAnalyzer(), true);   }

  /**    * 把每条新闻加入索引中 * @param url 新闻的url    * @param title 新闻的标题 * @throws java.lang.Exception    */   void AddNews(String url, String title) throws Exception {     Document _doc = new Document();     _doc.add(Field.Text("title", title));     _doc.add(Field.UnIndexed("url", url));     _writer.addDocument(_doc);   }

  /**    * 优化并且清理资源 * @throws java.lang.Exception    */   void close() throws Exception {     _writer.optimize();     _writer.close();   } }

 

 

package news;

/** * <p>Title: 新闻搜索引擎</p>
* <p>Copyright: Copyright (c) 2003</p> * <p>Company: </p> * @author 计算机99630 沈晨 * @version 1.0 * @Download:http://www.codefans.net */

import java.util.Iterator; import java.util.Vector;

import com.heaton.bot.HTMLPage; import com.heaton.bot.HTTP; import com.heaton.bot.Link;

public class HTMLParse {   HTTP _http = null;   public HTMLParse(HTTP http) {     _http = http;   }

  /**    * 对Web页面进行解析后建立索引 */ public void start() {     try {       HTMLPage _page = new HTMLPage(_http);       _page.open(_http.getURL(), null);       Vector _links = _page.getLinks();       Index _index = new Index();       Iterator _it = _links.iterator();       int n = 0;       while (_it.hasNext()) {         Link _link = (Link) _it.next();         String _herf = input(_link.getHREF().trim());         String _title = input(_link.getPrompt().trim());         _index.AddNews(_herf, _title);         n++;       }       System.out.println("共扫描到" + n + "条新闻");       _index.close();     }     catch (Exception ex) {       System.out.println(ex);     }   }   /**    * 解决java中的中文问题 * @param str 输入的中文 * @return 经过解码的中文 */ public static String input(String str) {     String temp = null;     if (str != null) {       try {         temp = new String(str.getBytes("ISO8859_1"));       }       catch (Exception e) {       }     }     return temp;   }

}

 

 

 

package news;

/** * <p>Title: 新闻搜索引擎</p>
* <p>Copyright: Copyright (c) 2003</p> * <p>Company: </p>
* @version 1.0   * @Download:http://www.codefans.net */

import com.heaton.bot.HTTP; import com.heaton.bot.HTTPSocket; import com.heaton.bot.ISpiderReportable; import com.heaton.bot.IWorkloadStorable; import com.heaton.bot.Spider; import com.heaton.bot.SpiderInternalWorkload;

public class Searcher     implements ISpiderReportable {   public static void main(String[] args) throws Exception {     IWorkloadStorable wl = new SpiderInternalWorkload();     Searcher _searcher = new Searcher();     Spider _spider         = new Spider(_searcher, "http://www.chenshen.com/index.html",                      new HTTPSocket(), 100, wl);     _spider.setMaxBody(100);     _spider.start();   }   public boolean foundInternalLink(String url) {     return false;   }   public boolean foundExternalLink(String url) {     return false;   }   public boolean foundOtherLink(String url) {     return false;   }   public void processPage(HTTP http) {     System.out.println("扫描网页:" + http.getURL());     new HTMLParse(http).start();   }   public void completePage(HTTP http, boolean error) {   }   public boolean getRemoveQuery() {     return true;   }   public void spiderComplete() {   } }

 

 

package newsserver;

import java.io.IOException; import java.io.PrintWriter; import javax.servlet.ServletException; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse;

import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.cn.ChineseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query;

/** * <p>Title: 新闻搜索引擎</p>
* <p>Copyright: Copyright (c) 2003</p> * <p>Company: </p>
* @version 1.0 * @Download:http://www.codefans.net */

public class Results     extends HttpServlet { private static final String CONTENT_TYPE = "text/html; charset=GBK";   //Initialize global variables   public void init() throws ServletException {   }

  //Process the HTTP Get request   public void doGet(HttpServletRequest request, HttpServletResponse response) throws       ServletException, IOException {     String QC = request.getParameter("QueryContent");     if (QC == null) {       QC = "";     }     else {       QC = input(QC);     }     response.setContentType(CONTENT_TYPE);     PrintWriter out = response.getWriter();     try {       Search(QC, out);     }     catch (Exception ex) {       System.out.println(ex.getMessage());     }   }

  public void Search(String qc, PrintWriter out) throws Exception {     // 从索引目录创建索引 IndexSearcher _searcher = new IndexSearcher("c:\\news\\index");     // 创建标准分析器 Analyzer analyzer = new ChineseAnalyzer();     // 查询条件 String line = qc;     // Query是一个抽象类 Query query = QueryParser.parse(line, "title", analyzer);

    out.println("<html>");     out.println("<head><title>搜索结果</title></head>");     out.println("<body bgcolor=#ffffff>");     out.println("<center>" +                 "<form action='/NewsServer/results' method='get'>" +                 "<font face='华文中宋' color='#3399FF'>新闻搜索引擎</font>:" +                 "<input type='text' name='QueryContent' size='20'>" +                 "<input type='submit' name='submit' value='开始搜索'>" +                 "</form></center>"                 );     out.println("<p>搜索关键字:<font color=red>" + query.toString("title") +                 "</font></p>");     Hits hits = _searcher.search(query);     out.println(" 总共找到<font color=red>" + hits.length() + "</font>条新闻<br>");

    final int HITS_PER_PAGE = 10;     for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {       int end = Math.min(hits.length(), start + HITS_PER_PAGE);       for (int i = start; i < end; i++) {         Document doc = hits.doc(i);         String url = doc.get("url");         if (url != null) {           out.println( (i + 1) + " <a href='" + url + "'>" +                       replace(doc.get("title"), qc) +                       "</a><br>");         }         else {           System.out.println("没有找到!");         }       }     } out.println("</body></html>");     _searcher.close();   };

  public String input(String str) {     String temp = null;     if (str != null) {       try {         temp = new String(str.getBytes("ISO8859_1"));       }       catch (Exception e) {       }     }     return temp;   }

  public String replace(String title, String keyword) {     return title.replaceAll(keyword, "<font color='red'>" + keyword + "</font>");   };

  //Clean up resources   public void destroy() {   } }