using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml;
namespace SearchEngineConsoleApp
class Program
/// <summary>
/// Entry point: downloads a fixed start page, extracts the absolute http:// links
/// found in its HTML and hands them to <c>Writetoxml</c>.
/// </summary>
public static void Main()
{
    string strurl = "http://www.cnblogs.com/OceanChen/";

    // StartsWith performs the same ordinal prefix test as Substring(0, 7) but
    // cannot throw when the string is shorter than the prefix.
    if (!strurl.StartsWith(@"http://", StringComparison.Ordinal))
    {
        strurl = @"http://" + strurl;
    }

    string strcode = getpagesource(strurl);
    ArrayList allinks = gethyperlinks(strcode);
    Writetoxml(strurl, allinks);
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body decoded as GB2312 text.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The complete response body as a string.</returns>
static string getpagesource(string url)
{
    Uri uri = new Uri(url);
    HttpWebRequest hwreq = (HttpWebRequest)WebRequest.Create(uri);

    // Request options must be assigned before GetResponse(): that call sends the
    // request, so values assigned afterwards never reach the server.
    hwreq.Method = "GET";
    hwreq.KeepAlive = false;

    // Dispose the response and the reader so the underlying connection is released.
    using (HttpWebResponse hwres = (HttpWebResponse)hwreq.GetResponse())
    using (StreamReader reader = new StreamReader(hwres.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Extracts the distinct absolute http:// URLs that occur in a piece of HTML.
/// </summary>
/// <param name="htmlcode">HTML source to scan; null or empty yields an empty list.</param>
/// <returns>
/// An ArrayList of URL strings in order of first appearance, each URL listed once.
/// </returns>
static ArrayList gethyperlinks(string htmlcode)
{
    ArrayList al = new ArrayList();
    if (string.IsNullOrEmpty(htmlcode))
    {
        return al;
    }

    string strregex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r = new Regex(strregex, RegexOptions.IgnoreCase);

    // Filter duplicate URLs. HashSet.Add returns false for a value it already
    // holds, which replaces a linear rescan of the result list for every match.
    HashSet<string> seen = new HashSet<string>();
    foreach (Match m in r.Matches(htmlcode))
    {
        if (seen.Add(m.Value))
        {
            al.Add(m.Value);
        }
    }
    return al;
}
/// <summary>
/// Writes the extracted links to the file "hyperlinks.xml" as UTF-8, indented XML.
/// The root element is "hyperlinks" with a "datetime" attribute; each link becomes a
/// child element named by <c>getdomain</c> whose text is the URL.
/// </summary>
/// <param name="strurl">URL of the page the links came from; used in the leading XML comment.</param>
/// <param name="alhyperlinks">The URL strings to write.</param>
static void Writetoxml(string strurl, ArrayList alhyperlinks)
{
    // Disposing the writer flushes buffered output and closes the file; without
    // it the document may never reach disk.
    using (XmlTextWriter Writer = new XmlTextWriter("hyperlinks.xml", Encoding.UTF8))
    {
        Writer.Formatting = Formatting.Indented;
        Writer.WriteComment("提取自" + strurl + "的超链接");
        Writer.WriteStartElement("hyperlinks", null);
        Writer.WriteAttributeString("datetime", DateTime.Now.ToString());
        foreach (string str in alhyperlinks)
        {
            Writer.WriteElementString(getdomain(str), null, str);
        }
        // Close the root element explicitly so the document is well-formed.
        Writer.WriteEndElement();
    }
}
/// <summary>
/// Returns the domain suffix of a URL (com, net, cn, org or gov) when that suffix
/// is immediately followed by "/"; otherwise returns "other".
/// </summary>
/// <param name="strurl">The URL to classify.</param>
/// <returns>The matched suffix without the dot and slash, or "other".</returns>
static string getdomain(string strurl)
{
    Match suffix = Regex.Match(strurl, @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

    // Strip the leading dot and the trailing slash from the matched text.
    string name = Regex.Replace(suffix.Value, @"\.|/$", "");

    return name == "" ? "other" : name;
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml;
namespace SearchEngineConsoleApp
class Program
/// <summary>
/// Entry point: downloads a fixed start page, extracts the absolute http:// links
/// found in its HTML and hands them to <c>Writetoxml</c>.
/// </summary>
public static void Main()
{
    string strurl = "http://www.cnblogs.com/OceanChen/";

    // StartsWith performs the same ordinal prefix test as Substring(0, 7) but
    // cannot throw when the string is shorter than the prefix.
    if (!strurl.StartsWith(@"http://", StringComparison.Ordinal))
    {
        strurl = @"http://" + strurl;
    }

    string strcode = getpagesource(strurl);
    ArrayList allinks = gethyperlinks(strcode);
    Writetoxml(strurl, allinks);
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its body decoded as GB2312 text.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The complete response body as a string.</returns>
static string getpagesource(string url)
{
    Uri uri = new Uri(url);
    HttpWebRequest hwreq = (HttpWebRequest)WebRequest.Create(uri);

    // Request options must be assigned before GetResponse(): that call sends the
    // request, so values assigned afterwards never reach the server.
    hwreq.Method = "GET";
    hwreq.KeepAlive = false;

    // Dispose the response and the reader so the underlying connection is released.
    using (HttpWebResponse hwres = (HttpWebResponse)hwreq.GetResponse())
    using (StreamReader reader = new StreamReader(hwres.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Extracts the distinct absolute http:// URLs that occur in a piece of HTML.
/// </summary>
/// <param name="htmlcode">HTML source to scan; null or empty yields an empty list.</param>
/// <returns>
/// An ArrayList of URL strings in order of first appearance, each URL listed once.
/// </returns>
static ArrayList gethyperlinks(string htmlcode)
{
    ArrayList al = new ArrayList();
    if (string.IsNullOrEmpty(htmlcode))
    {
        return al;
    }

    string strregex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r = new Regex(strregex, RegexOptions.IgnoreCase);

    // Filter duplicate URLs. HashSet.Add returns false for a value it already
    // holds, which replaces a linear rescan of the result list for every match.
    HashSet<string> seen = new HashSet<string>();
    foreach (Match m in r.Matches(htmlcode))
    {
        if (seen.Add(m.Value))
        {
            al.Add(m.Value);
        }
    }
    return al;
}
/// <summary>
/// Writes the extracted links to the file "hyperlinks.xml" as UTF-8, indented XML.
/// The root element is "hyperlinks" with a "datetime" attribute; each link becomes a
/// child element named by <c>getdomain</c> whose text is the URL.
/// </summary>
/// <param name="strurl">URL of the page the links came from; used in the leading XML comment.</param>
/// <param name="alhyperlinks">The URL strings to write.</param>
static void Writetoxml(string strurl, ArrayList alhyperlinks)
{
    // Disposing the writer flushes buffered output and closes the file; without
    // it the document may never reach disk.
    using (XmlTextWriter Writer = new XmlTextWriter("hyperlinks.xml", Encoding.UTF8))
    {
        Writer.Formatting = Formatting.Indented;
        Writer.WriteComment("提取自" + strurl + "的超链接");
        Writer.WriteStartElement("hyperlinks", null);
        Writer.WriteAttributeString("datetime", DateTime.Now.ToString());
        foreach (string str in alhyperlinks)
        {
            Writer.WriteElementString(getdomain(str), null, str);
        }
        // Close the root element explicitly so the document is well-formed.
        Writer.WriteEndElement();
    }
}
/// <summary>
/// Returns the domain suffix of a URL (com, net, cn, org or gov) when that suffix
/// is immediately followed by "/"; otherwise returns "other".
/// </summary>
/// <param name="strurl">The URL to classify.</param>
/// <returns>The matched suffix without the dot and slash, or "other".</returns>
static string getdomain(string strurl)
{
    Match suffix = Regex.Match(strurl, @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

    // Strip the leading dot and the trailing slash from the matched text.
    string name = Regex.Replace(suffix.Value, @"\.|/$", "");

    return name == "" ? "other" : name;
}
Search.aspx 页面代码如下:
<%@ Page Language="C#" AutoEventWireup="false" Inherits="SearchEngine" Src="search.aspx.cs" %>
<script language="c#" runat="server">
// Click handler for the search button: runs the search over the entered URLs,
// shows a summary line in the info label, hides the form and binds the result list.
protected void search(Object sender, EventArgs e)
{
    if (SearchWebSites(keyword.Text, urls.Text))
    {
        // The keyword is user input echoed back into HTML, so encode it first.
        string safeKeyword = Server.HtmlEncode(keyword.Text);

        info.Text = "Searched <font color=\"red\">" + SearchResults.Count + "</font> web page(s) ";
        info.Text += "on the keyword <font color=\"red\">\"" + safeKeyword + "</font>\". ";
        info.Text += "Total search time was <font color=\"red\">" + timeSpent + "</font>";
        SearchForm.Visible = false;
        ResultList.DataSource = SearchResults;
        // Assigning DataSource alone renders nothing; the Repeater must be bound.
        ResultList.DataBind();
    }
}
<title>Multi-threaded Search Engine</title>
font-family: verdana;
font-size: 12px;
color: #333333;
<asp:Label ID="info" class="BodyText" Text="URL of the web sites to search, one url per line."
runat="server" /><br />
<asp:Repeater ID="ResultList" runat="server">
<table class="BodyText" border="0" cellpadding="3" cellspacing="3">
<b>Web Page Title</b>
<b>Web Page URL</b>
<b>Searched Time</b>
<%# DataBinder.Eval(Container.DataItem, "instanceCount") %>
<%# DataBinder.Eval(Container.DataItem, "pageTitle") %>
<%# DataBinder.Eval(Container.DataItem, "pageURL") %>
<%# DataBinder.Eval(Container.DataItem, "timeSpent") %>
<form id="SearchForm" runat="server">
<table class="BodyText">
<asp:TextBox class="BodyText" Text="news" ID="keyword" runat="server" />
<td valign="top">
<asp:TextBox class="BodyText" Text="" ID="urls" Rows="10" Columns="30" TextMode="MultiLine"
runat="server" />
<td align="right" colspan="2">
<asp:Button class="BodyText" Text="search!" type="submit" OnClick="search" runat="server"
ID="Button1" />
<%@ Page Language="C#" AutoEventWireup="false" Inherits="SearchEngine" Src="search.aspx.cs" %>
<script language="c#" runat="server">
// Click handler for the search button: runs the search over the entered URLs,
// shows a summary line in the info label, hides the form and binds the result list.
protected void search(Object sender, EventArgs e)
{
    if (SearchWebSites(keyword.Text, urls.Text))
    {
        // The keyword is user input echoed back into HTML, so encode it first.
        string safeKeyword = Server.HtmlEncode(keyword.Text);

        info.Text = "Searched <font color=\"red\">" + SearchResults.Count + "</font> web page(s) ";
        info.Text += "on the keyword <font color=\"red\">\"" + safeKeyword + "</font>\". ";
        info.Text += "Total search time was <font color=\"red\">" + timeSpent + "</font>";
        SearchForm.Visible = false;
        ResultList.DataSource = SearchResults;
        // Assigning DataSource alone renders nothing; the Repeater must be bound.
        ResultList.DataBind();
    }
}
<title>Multi-threaded Search Engine</title>
font-family: verdana;
font-size: 12px;
color: #333333;
<asp:Label ID="info" class="BodyText" Text="URL of the web sites to search, one url per line."
runat="server" /><br />
<asp:Repeater ID="ResultList" runat="server">
<table class="BodyText" border="0" cellpadding="3" cellspacing="3">
<b>Web Page Title</b>
<b>Web Page URL</b>
<b>Searched Time</b>
<%# DataBinder.Eval(Container.DataItem, "instanceCount") %>
<%# DataBinder.Eval(Container.DataItem, "pageTitle") %>
<%# DataBinder.Eval(Container.DataItem, "pageURL") %>
<%# DataBinder.Eval(Container.DataItem, "timeSpent") %>
<form id="SearchForm" runat="server">
<table class="BodyText">
<asp:TextBox class="BodyText" Text="news" ID="keyword" runat="server" />
<td valign="top">
<asp:TextBox class="BodyText" Text="" ID="urls" Rows="10" Columns="30" TextMode="MultiLine"
runat="server" />
<td align="right" colspan="2">
<asp:Button class="BodyText" Text="search!" type="submit" OnClick="search" runat="server"
ID="Button1" />
Search.aspx.cs 后台代码如下:
using System;
using System.IO;
using System.Net;
using System.Web;
using System.Web.UI;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Threading;
/// <summary>
/// Page class that searches a list of web pages for a keyword, using one thread per URL.
/// </summary>
public class SearchEngine : Page
{
    // private member fields.
    private ArrayList _pages;
    private TimeSpan _timeSpent;

    /// <summary>
    /// Returns an ArrayList of WebPage objects,
    /// which contains the search results information.
    /// </summary>
    public ArrayList SearchResults
    {
        get { return _pages; }
    }

    /// <summary>
    /// A TimeSpan object. It lets us know how long the entire search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// Searches every listed URL for the keyword and blocks until all searches finish.
    /// </summary>
    /// <param name="keyword">The keyword to search for.</param>
    /// <param name="pURLs">
    /// List of URLs, separated by line breaks (\n or \r\n). Blank lines are skipped;
    /// null is treated as an empty list.
    /// </param>
    /// <returns>Always true.</returns>
    public bool SearchWebSites(string keyword, string pURLs)
    {
        // start the timer
        DateTime lStarted = DateTime.Now;
        _pages = new ArrayList();

        // Split on both CR and LF: a multi-line text box submits "\r\n", and splitting
        // on '\n' alone would leave a trailing '\r' on every URL.
        string[] lURLs = (pURLs ?? "").Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

        // one slot per candidate URL; lThreadCount tracks how many were actually started
        Thread[] t = new Thread[lURLs.Length];
        int lThreadCount = 0;

        foreach (string lRaw in lURLs)
        {
            string lURL = lRaw.Trim();
            if (lURL.Length == 0)
            {
                continue;
            }

            // create a WebPage object for each url and add it to the results
            WebPage wp = new WebPage(keyword, lURL);
            _pages.Add(wp);

            // run the page's search() on its own thread
            t[lThreadCount] = new Thread(new ThreadStart(wp.search));
            t[lThreadCount].Start();
            lThreadCount++;
        }

        // wait for all the threads to finish
        for (int lIdx = 0; lIdx < lThreadCount; lIdx++)
        {
            t[lIdx].Join();
        }

        // stop the timer.
        _timeSpent = DateTime.Now.Subtract(lStarted);
        return true;
    }
}
/// <summary>
/// The class that contains information for each searched web page: its URL, title,
/// how often the keyword occurs in its text, and how long the search took.
/// </summary>
public class WebPage
{
    // Fixed patterns are compiled once and shared instead of being rebuilt per call.
    private static readonly Regex _contentRegex =
        new Regex(">(?:(?<c>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex _titleRegex =
        new Regex("(?:<\\s*title\\s*>(?<t>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // private member fields.
    private int _instanceCount;
    private string _pageURL;
    private string _pageTitle;
    private string _keyword;
    private TimeSpan _timeSpent;

    /// <summary>
    /// A TimeSpan object. It lets us know how long the page search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// How many times the search keyword appears on the page.
    /// </summary>
    public int instanceCount
    {
        get { return _instanceCount; }
    }

    /// <summary>
    /// The URL of the search page
    /// </summary>
    public string pageURL
    {
        get { return _pageURL; }
    }

    /// <summary>
    /// The title of the search page
    /// </summary>
    public string pageTitle
    {
        get { return _pageTitle; }
    }

    public WebPage() { }

    /// <summary>
    /// A parameterized constructor of the WebPage class.
    /// </summary>
    /// <param name="keyword">The keyword to search for.</param>
    /// <param name="pageURL">The URL to connect to.</param>
    public WebPage(string keyword, string pageURL)
    {
        _keyword = keyword;
        _pageURL = pageURL;
    }

    /// <summary>
    /// Connects to the page, reads its content, then fills in the title, the keyword
    /// count and the elapsed time. If the URL is invalid or the download fails, the
    /// title is set to "" and the count to 0 instead of throwing: an exception that
    /// escapes a worker thread would otherwise terminate the process.
    /// </summary>
    public void search()
    {
        // start timing it
        DateTime lStarted = DateTime.Now;
        try
        {
            WebRequest webreq = WebRequest.Create(_pageURL);
            string pageCode;

            // Dispose the response and reader so the connection is released.
            // NOTE(review): the body is decoded as ASCII, so non-ASCII text is lost;
            // confirm whether the response charset should be honoured instead.
            using (WebResponse webresp = webreq.GetResponse())
            using (StreamReader sr = new StreamReader(webresp.GetResponseStream(), Encoding.ASCII))
            {
                pageCode = sr.ReadToEnd();
            }

            // get the page title
            _pageTitle = getPageTitle(pageCode);
            // get the number of times the keyword appeared on the page
            _instanceCount = countInstance(getPureContent(pageCode));
        }
        catch (WebException) { resetResult(); }
        catch (UriFormatException) { resetResult(); }
        catch (NotSupportedException) { resetResult(); }
        catch (ArgumentNullException) { resetResult(); }
        catch (IOException) { resetResult(); }
        finally
        {
            // stop the timer
            _timeSpent = DateTime.Now.Subtract(lStarted);
        }
    }

    // Records an empty result for a page that could not be fetched.
    private void resetResult()
    {
        _pageTitle = "";
        _instanceCount = 0;
    }

    // Counts case-insensitive occurrences of the keyword in str. The keyword is
    // escaped so it is matched literally; unescaped, input such as "c++" is an
    // invalid pattern and throws.
    private int countInstance(string str)
    {
        if (string.IsNullOrEmpty(_keyword) || string.IsNullOrEmpty(str))
        {
            return 0;
        }
        Regex rx = new Regex(Regex.Escape(_keyword), RegexOptions.IgnoreCase);
        return rx.Matches(str).Count;
    }

    // Collects every run of text that follows a ">" up to the next "<", which drops
    // the HTML tags, and returns the HTML-decoded runs separated by spaces.
    private string getPureContent(string str)
    {
        StringBuilder sb = new StringBuilder();
        for (Match mt = _contentRegex.Match(str); mt.Success; mt = mt.NextMatch())
        {
            sb.Append(HttpUtility.HtmlDecode(mt.Groups["c"].Value));
            sb.Append(" ");
        }
        return sb.ToString();
    }

    // Returns the text of the first <title> tag on the page, trimmed, or "" when
    // the page has none.
    private string getPageTitle(string str)
    {
        Match mt = _titleRegex.Match(str);
        return mt.Success ? mt.Groups["t"].Value.Trim() : "";
    }
}
using System;
using System.IO;
using System.Net;
using System.Web;
using System.Web.UI;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Threading;
/// <summary>
/// Page class that searches a list of web pages for a keyword, using one thread per URL.
/// </summary>
public class SearchEngine : Page
{
    // private member fields.
    private ArrayList _pages;
    private TimeSpan _timeSpent;

    /// <summary>
    /// Returns an ArrayList of WebPage objects,
    /// which contains the search results information.
    /// </summary>
    public ArrayList SearchResults
    {
        get { return _pages; }
    }

    /// <summary>
    /// A TimeSpan object. It lets us know how long the entire search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// Searches every listed URL for the keyword and blocks until all searches finish.
    /// </summary>
    /// <param name="keyword">The keyword to search for.</param>
    /// <param name="pURLs">
    /// List of URLs, separated by line breaks (\n or \r\n). Blank lines are skipped;
    /// null is treated as an empty list.
    /// </param>
    /// <returns>Always true.</returns>
    public bool SearchWebSites(string keyword, string pURLs)
    {
        // start the timer
        DateTime lStarted = DateTime.Now;
        _pages = new ArrayList();

        // Split on both CR and LF: a multi-line text box submits "\r\n", and splitting
        // on '\n' alone would leave a trailing '\r' on every URL.
        string[] lURLs = (pURLs ?? "").Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

        // one slot per candidate URL; lThreadCount tracks how many were actually started
        Thread[] t = new Thread[lURLs.Length];
        int lThreadCount = 0;

        foreach (string lRaw in lURLs)
        {
            string lURL = lRaw.Trim();
            if (lURL.Length == 0)
            {
                continue;
            }

            // create a WebPage object for each url and add it to the results
            WebPage wp = new WebPage(keyword, lURL);
            _pages.Add(wp);

            // run the page's search() on its own thread
            t[lThreadCount] = new Thread(new ThreadStart(wp.search));
            t[lThreadCount].Start();
            lThreadCount++;
        }

        // wait for all the threads to finish
        for (int lIdx = 0; lIdx < lThreadCount; lIdx++)
        {
            t[lIdx].Join();
        }

        // stop the timer.
        _timeSpent = DateTime.Now.Subtract(lStarted);
        return true;
    }
}
/// <summary>
/// The class that contains information for each searched web page: its URL, title,
/// how often the keyword occurs in its text, and how long the search took.
/// </summary>
public class WebPage
{
    // Fixed patterns are compiled once and shared instead of being rebuilt per call.
    private static readonly Regex _contentRegex =
        new Regex(">(?:(?<c>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    private static readonly Regex _titleRegex =
        new Regex("(?:<\\s*title\\s*>(?<t>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // private member fields.
    private int _instanceCount;
    private string _pageURL;
    private string _pageTitle;
    private string _keyword;
    private TimeSpan _timeSpent;

    /// <summary>
    /// A TimeSpan object. It lets us know how long the page search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// How many times the search keyword appears on the page.
    /// </summary>
    public int instanceCount
    {
        get { return _instanceCount; }
    }

    /// <summary>
    /// The URL of the search page
    /// </summary>
    public string pageURL
    {
        get { return _pageURL; }
    }

    /// <summary>
    /// The title of the search page
    /// </summary>
    public string pageTitle
    {
        get { return _pageTitle; }
    }

    public WebPage() { }

    /// <summary>
    /// A parameterized constructor of the WebPage class.
    /// </summary>
    /// <param name="keyword">The keyword to search for.</param>
    /// <param name="pageURL">The URL to connect to.</param>
    public WebPage(string keyword, string pageURL)
    {
        _keyword = keyword;
        _pageURL = pageURL;
    }

    /// <summary>
    /// Connects to the page, reads its content, then fills in the title, the keyword
    /// count and the elapsed time. If the URL is invalid or the download fails, the
    /// title is set to "" and the count to 0 instead of throwing: an exception that
    /// escapes a worker thread would otherwise terminate the process.
    /// </summary>
    public void search()
    {
        // start timing it
        DateTime lStarted = DateTime.Now;
        try
        {
            WebRequest webreq = WebRequest.Create(_pageURL);
            string pageCode;

            // Dispose the response and reader so the connection is released.
            // NOTE(review): the body is decoded as ASCII, so non-ASCII text is lost;
            // confirm whether the response charset should be honoured instead.
            using (WebResponse webresp = webreq.GetResponse())
            using (StreamReader sr = new StreamReader(webresp.GetResponseStream(), Encoding.ASCII))
            {
                pageCode = sr.ReadToEnd();
            }

            // get the page title
            _pageTitle = getPageTitle(pageCode);
            // get the number of times the keyword appeared on the page
            _instanceCount = countInstance(getPureContent(pageCode));
        }
        catch (WebException) { resetResult(); }
        catch (UriFormatException) { resetResult(); }
        catch (NotSupportedException) { resetResult(); }
        catch (ArgumentNullException) { resetResult(); }
        catch (IOException) { resetResult(); }
        finally
        {
            // stop the timer
            _timeSpent = DateTime.Now.Subtract(lStarted);
        }
    }

    // Records an empty result for a page that could not be fetched.
    private void resetResult()
    {
        _pageTitle = "";
        _instanceCount = 0;
    }

    // Counts case-insensitive occurrences of the keyword in str. The keyword is
    // escaped so it is matched literally; unescaped, input such as "c++" is an
    // invalid pattern and throws.
    private int countInstance(string str)
    {
        if (string.IsNullOrEmpty(_keyword) || string.IsNullOrEmpty(str))
        {
            return 0;
        }
        Regex rx = new Regex(Regex.Escape(_keyword), RegexOptions.IgnoreCase);
        return rx.Matches(str).Count;
    }

    // Collects every run of text that follows a ">" up to the next "<", which drops
    // the HTML tags, and returns the HTML-decoded runs separated by spaces.
    private string getPureContent(string str)
    {
        StringBuilder sb = new StringBuilder();
        for (Match mt = _contentRegex.Match(str); mt.Success; mt = mt.NextMatch())
        {
            sb.Append(HttpUtility.HtmlDecode(mt.Groups["c"].Value));
            sb.Append(" ");
        }
        return sb.ToString();
    }

    // Returns the text of the first <title> tag on the page, trimmed, or "" when
    // the page has none.
    private string getPageTitle(string str)
    {
        Match mt = _titleRegex.Match(str);
        return mt.Success ? mt.Groups["t"].Value.Trim() : "";
    }
}
http://www.codeproject.com/KB/applications/SearchDotnet.aspx (Internal Site Search Engine 测试通过)
http://www.codeproject.com/KB/IP/Searcharoo_4.aspx (C# search engine: refactored to search Word, PDF and more)
测试通过,不过有点小麻烦,得学会序列化和反序列化(这里采用二进制)。 解决问题方法是:
1. 随便找4个文件,分别命名为:plaintext.txt,Kilimanjaro.pdf,Decorator.ppt,Marathoning.doc 放在目录 content 中;
2. 根据错误提示(提示为英文;注意提示上方的链接,该链接说明了如何对二进制文件和 XML 文件进行序列化与反序列化),创建二进制文件 z_searcharoo.dat 并放到指定目录下,然后运行即可。