【原】获取网页所有链接——简易搜索引擎雏形(C#,已测试通过,有CODE)
最近一直在看搜索引擎方面的资料,在Google上找了很多,可是没有找到一个下载下来可以直接使用的!
没有办法只能看别人的代码,然后一点一点的调试!功夫不负有心人,终于算是修得正果!贴代码:
最近一直在看搜索引擎方面的资料,在Google上找了很多,可是没有找到一个下载下来可以直接使用的!
没有办法只能看别人的代码,然后一点一点的调试!功夫不负有心人,终于算是修得正果!贴代码:
Code
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml;
namespace SearchEngineConsoleApp
{
class Program
{
/// <summary>
/// Entry point: asks for a page address, downloads that page, extracts the
/// distinct hyperlinks it contains and writes them to hyperlinks.xml.
/// </summary>
public static void Main()
{
    // Used when the user just presses Enter or when no input is available.
    const string defaultUrl = "http://www.cnblogs.com/OceanChen/";

    Console.Write("请输入一个网页地址:");
    // The prompt was previously printed without ever reading the answer.
    string input = (Console.ReadLine() ?? string.Empty).Trim();
    string strurl = input.Length == 0 ? defaultUrl : input;

    // StartsWith is safe for inputs shorter than the scheme prefix,
    // whereas Substring(0, 7) throws ArgumentOutOfRangeException.
    bool hasScheme = strurl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || strurl.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        strurl = @"http://" + strurl;
    }

    Console.WriteLine("正在获取页面代码,请稍侯");
    string strcode = getpagesource(strurl);

    Console.WriteLine("正在提取超链接,请稍侯");
    ArrayList allinks = gethyperlinks(strcode);

    Console.WriteLine("正在写入文件,请稍侯");
    Writetoxml(strurl, allinks);
}
/// <summary>
/// Downloads the HTML source of the given page.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The response body decoded as gb2312 text.</returns>
static string getpagesource(string url)
{
    Uri uri = new Uri(url);
    HttpWebRequest hwreq = (HttpWebRequest)WebRequest.Create(uri);

    // Request options must be set before the request is sent; the original
    // assigned them after GetResponse(), where they no longer had any effect.
    hwreq.Method = "GET";
    hwreq.KeepAlive = false;

    // Dispose the response and the reader so the connection is released
    // even when reading the body fails.
    using (HttpWebResponse hwres = (HttpWebResponse)hwreq.GetResponse())
    using (StreamReader reader = new StreamReader(hwres.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Extracts the distinct absolute http:// URLs found in a block of HTML.
/// </summary>
/// <param name="htmlcode">HTML source to scan.</param>
/// <returns>The unique URLs, ordered by ArrayList.Sort().</returns>
static ArrayList gethyperlinks(string htmlcode)
{
    string strregex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r = new Regex(strregex, RegexOptions.IgnoreCase);

    ArrayList al = new ArrayList();
    // Filter duplicate URLs. The ordinal set gives the same result as the
    // original "==" comparison without rescanning the list for every match.
    HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
    foreach (Match m in r.Matches(htmlcode))
    {
        string strnew = m.Value;
        if (seen.Add(strnew))
        {
            al.Add(strnew);
        }
    }

    al.Sort();
    return al;
}
/// <summary>
/// Writes the extracted URLs to hyperlinks.xml in the current directory.
/// Each URL becomes an element named after its domain suffix (see getdomain).
/// </summary>
/// <param name="strurl">Address of the page the links were taken from; recorded in an XML comment.</param>
/// <param name="alhyperlinks">The URL strings to write.</param>
static void Writetoxml(string strurl, ArrayList alhyperlinks)
{
    // "using" closes the file even when a write throws part-way through.
    using (XmlTextWriter Writer = new XmlTextWriter("hyperlinks.xml", Encoding.UTF8))
    {
        Writer.Formatting = Formatting.Indented;
        Writer.WriteStartDocument(false);

        // An XML comment may not contain "--"; break the sequence up so a URL
        // containing it cannot make WriteComment throw.
        string commentUrl = strurl.Replace("--", "- -");
        Writer.WriteComment("提取自" + commentUrl + "的超链接");

        // The nested pair of <hyperlinks> elements is the established file layout.
        Writer.WriteStartElement("hyperlinks");
        Writer.WriteStartElement("hyperlinks", null);
        Writer.WriteAttributeString("datetime", DateTime.Now.ToString());

        foreach (string str in alhyperlinks)
        {
            string title = getdomain(str);
            Writer.WriteElementString(title, null, str);
        }

        Writer.WriteEndElement();
        Writer.WriteEndElement();
        Writer.Flush();
    }
}
/// <summary>
/// Returns the domain suffix of a URL (com, net, cn, org or gov) with the
/// surrounding dot and slash removed, or "other" when none of them is found.
/// </summary>
/// <param name="strurl">The URL to inspect.</param>
/// <returns>The matched suffix as it appears in the URL, or "other".</returns>
static string getdomain(string strurl)
{
    // First occurrence of a known suffix that is immediately followed by "/".
    Match suffix = Regex.Match(strurl, @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

    // Drop the leading dot and the trailing slash from the matched text.
    string cleaned = Regex.Replace(suffix.Value, @"\.|/$", "");

    return cleaned.Length == 0 ? "other" : cleaned;
}
}
}
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml;
namespace SearchEngineConsoleApp
{
class Program
{
/// <summary>
/// Entry point: asks for a page address, downloads that page, extracts the
/// distinct hyperlinks it contains and writes them to hyperlinks.xml.
/// </summary>
public static void Main()
{
    // Used when the user just presses Enter or when no input is available.
    const string defaultUrl = "http://www.cnblogs.com/OceanChen/";

    Console.Write("请输入一个网页地址:");
    // The prompt was previously printed without ever reading the answer.
    string input = (Console.ReadLine() ?? string.Empty).Trim();
    string strurl = input.Length == 0 ? defaultUrl : input;

    // StartsWith is safe for inputs shorter than the scheme prefix,
    // whereas Substring(0, 7) throws ArgumentOutOfRangeException.
    bool hasScheme = strurl.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || strurl.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        strurl = @"http://" + strurl;
    }

    Console.WriteLine("正在获取页面代码,请稍侯");
    string strcode = getpagesource(strurl);

    Console.WriteLine("正在提取超链接,请稍侯");
    ArrayList allinks = gethyperlinks(strcode);

    Console.WriteLine("正在写入文件,请稍侯");
    Writetoxml(strurl, allinks);
}
/// <summary>
/// Downloads the HTML source of the given page.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The response body decoded as gb2312 text.</returns>
static string getpagesource(string url)
{
    Uri uri = new Uri(url);
    HttpWebRequest hwreq = (HttpWebRequest)WebRequest.Create(uri);

    // Request options must be set before the request is sent; the original
    // assigned them after GetResponse(), where they no longer had any effect.
    hwreq.Method = "GET";
    hwreq.KeepAlive = false;

    // Dispose the response and the reader so the connection is released
    // even when reading the body fails.
    using (HttpWebResponse hwres = (HttpWebResponse)hwreq.GetResponse())
    using (StreamReader reader = new StreamReader(hwres.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312")))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Extracts the distinct absolute http:// URLs found in a block of HTML.
/// </summary>
/// <param name="htmlcode">HTML source to scan.</param>
/// <returns>The unique URLs, ordered by ArrayList.Sort().</returns>
static ArrayList gethyperlinks(string htmlcode)
{
    string strregex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r = new Regex(strregex, RegexOptions.IgnoreCase);

    ArrayList al = new ArrayList();
    // Filter duplicate URLs. The ordinal set gives the same result as the
    // original "==" comparison without rescanning the list for every match.
    HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
    foreach (Match m in r.Matches(htmlcode))
    {
        string strnew = m.Value;
        if (seen.Add(strnew))
        {
            al.Add(strnew);
        }
    }

    al.Sort();
    return al;
}
/// <summary>
/// Writes the extracted URLs to hyperlinks.xml in the current directory.
/// Each URL becomes an element named after its domain suffix (see getdomain).
/// </summary>
/// <param name="strurl">Address of the page the links were taken from; recorded in an XML comment.</param>
/// <param name="alhyperlinks">The URL strings to write.</param>
static void Writetoxml(string strurl, ArrayList alhyperlinks)
{
    // "using" closes the file even when a write throws part-way through.
    using (XmlTextWriter Writer = new XmlTextWriter("hyperlinks.xml", Encoding.UTF8))
    {
        Writer.Formatting = Formatting.Indented;
        Writer.WriteStartDocument(false);

        // An XML comment may not contain "--"; break the sequence up so a URL
        // containing it cannot make WriteComment throw.
        string commentUrl = strurl.Replace("--", "- -");
        Writer.WriteComment("提取自" + commentUrl + "的超链接");

        // The nested pair of <hyperlinks> elements is the established file layout.
        Writer.WriteStartElement("hyperlinks");
        Writer.WriteStartElement("hyperlinks", null);
        Writer.WriteAttributeString("datetime", DateTime.Now.ToString());

        foreach (string str in alhyperlinks)
        {
            string title = getdomain(str);
            Writer.WriteElementString(title, null, str);
        }

        Writer.WriteEndElement();
        Writer.WriteEndElement();
        Writer.Flush();
    }
}
/// <summary>
/// Returns the domain suffix of a URL (com, net, cn, org or gov) with the
/// surrounding dot and slash removed, or "other" when none of them is found.
/// </summary>
/// <param name="strurl">The URL to inspect.</param>
/// <returns>The matched suffix as it appears in the URL, or "other".</returns>
static string getdomain(string strurl)
{
    // First occurrence of a known suffix that is immediately followed by "/".
    Match suffix = Regex.Match(strurl, @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

    // Drop the leading dot and the trailing slash from the matched text.
    string cleaned = Regex.Replace(suffix.Value, @"\.|/$", "");

    return cleaned.Length == 0 ? "other" : cleaned;
}
}
}
这段代码只实现了部分功能,即从单个页面中抓取不重复的链接。后续还需要补充的工作是:对抓取到的站内链接进行循环(递归)抓取,利用正则表达式收集单一站点的全部链接,目的是为了收集自己最感兴趣的内容。
为了使读者有信心,继续贴一些重量级别的代码过来,这段代码是统计一个URL内关键字出现的次数,测试通过!见代码:
Search.aspx 页面代码如下:
Code
<%@ Page Language="C#" AutoEventWireup="false" Inherits="SearchEngine" Src="search.aspx.cs" %>
<script language="c#" runat="server">
// Click handler of the search button: runs the search over the submitted URLs,
// shows a summary line, hides the input form and binds the result table.
protected void search(Object sender, EventArgs e)
{
    if (!SearchWebSites(keyword.Text, urls.Text))
    {
        return;
    }

    // The keyword is user input echoed back into markup; encode it so it
    // cannot inject HTML or script into the page.
    string safeKeyword = Server.HtmlEncode(keyword.Text);

    info.Text = "Searched <font color=\"red\">" + SearchResults.Count + "</font> web page(s) "
        + "on the keyword <font color=\"red\">\"" + safeKeyword + "</font>\". "
        + "Total search time was <font color=\"red\">" + timeSpent + "</font>";

    SearchForm.Visible = false;
    ResultList.DataSource = SearchResults;
    ResultList.DataBind();
}
</script>
<html>
<head>
<title>Multi-threaded Search Engine</title>
<style>
/* Shared text style for the form controls and the result table. */
.BodyText
{
    font-family: verdana;
    font-size: 12px;
    /* Hex colours need the leading "#"; without it the value is invalid CSS. */
    color: #333333;
}
</style>
</head>
<body>
<asp:Label ID="info" class="BodyText" Text="URL of the web sites to search, one url per line."
runat="server" /><br />
<asp:Repeater ID="ResultList" runat="server">
<HeaderTemplate>
<table class="BodyText" border="0" cellpadding="3" cellspacing="3">
<tr>
<td>
<b>Found</b>
</td>
<td>
<b>Web Page Title</b>
</td>
<td>
<b>Web Page URL</b>
</td>
<td>
<b>Searched Time</b>
</td>
</tr>
</HeaderTemplate>
<ItemTemplate>
<tr>
<td>
<%# DataBinder.Eval(Container.DataItem, "instanceCount") %>
</td>
<td>
<%# DataBinder.Eval(Container.DataItem, "pageTitle") %>
</td>
<td>
<%# DataBinder.Eval(Container.DataItem, "pageURL") %>
</td>
<td>
<%# DataBinder.Eval(Container.DataItem, "timeSpent") %>
</td>
</tr>
</ItemTemplate>
<FooterTemplate>
</table>
</FooterTemplate>
</asp:Repeater>
<form id="SearchForm" runat="server">
<table class="BodyText">
<tr>
<td>
keyword:
</td>
<td>
<asp:TextBox class="BodyText" Text="news" ID="keyword" runat="server" />
</td>
</tr>
<tr>
<td valign="top">
urls:
</td>
<td>
<asp:TextBox class="BodyText" Text="" ID="urls" Rows="10" Columns="30" TextMode="MultiLine"
runat="server" />
</td>
</tr>
<tr>
<td align="right" colspan="2">
<asp:Button class="BodyText" Text="search!" type="submit" OnClick="search" runat="server"
ID="Button1" />
</td>
</tr>
</table>
</form>
</body>
</html>
<%@ Page Language="C#" AutoEventWireup="false" Inherits="SearchEngine" Src="search.aspx.cs" %>
<script language="c#" runat="server">
// Click handler of the search button: runs the search over the submitted URLs,
// shows a summary line, hides the input form and binds the result table.
protected void search(Object sender, EventArgs e)
{
    if (!SearchWebSites(keyword.Text, urls.Text))
    {
        return;
    }

    // The keyword is user input echoed back into markup; encode it so it
    // cannot inject HTML or script into the page.
    string safeKeyword = Server.HtmlEncode(keyword.Text);

    info.Text = "Searched <font color=\"red\">" + SearchResults.Count + "</font> web page(s) "
        + "on the keyword <font color=\"red\">\"" + safeKeyword + "</font>\". "
        + "Total search time was <font color=\"red\">" + timeSpent + "</font>";

    SearchForm.Visible = false;
    ResultList.DataSource = SearchResults;
    ResultList.DataBind();
}
</script>
<html>
<head>
<title>Multi-threaded Search Engine</title>
<style>
/* Shared text style for the form controls and the result table. */
.BodyText
{
    font-family: verdana;
    font-size: 12px;
    /* Hex colours need the leading "#"; without it the value is invalid CSS. */
    color: #333333;
}
</style>
</head>
<body>
<asp:Label ID="info" class="BodyText" Text="URL of the web sites to search, one url per line."
runat="server" /><br />
<asp:Repeater ID="ResultList" runat="server">
<HeaderTemplate>
<table class="BodyText" border="0" cellpadding="3" cellspacing="3">
<tr>
<td>
<b>Found</b>
</td>
<td>
<b>Web Page Title</b>
</td>
<td>
<b>Web Page URL</b>
</td>
<td>
<b>Searched Time</b>
</td>
</tr>
</HeaderTemplate>
<ItemTemplate>
<tr>
<td>
<%# DataBinder.Eval(Container.DataItem, "instanceCount") %>
</td>
<td>
<%# DataBinder.Eval(Container.DataItem, "pageTitle") %>
</td>
<td>
<%# DataBinder.Eval(Container.DataItem, "pageURL") %>
</td>
<td>
<%# DataBinder.Eval(Container.DataItem, "timeSpent") %>
</td>
</tr>
</ItemTemplate>
<FooterTemplate>
</table>
</FooterTemplate>
</asp:Repeater>
<form id="SearchForm" runat="server">
<table class="BodyText">
<tr>
<td>
keyword:
</td>
<td>
<asp:TextBox class="BodyText" Text="news" ID="keyword" runat="server" />
</td>
</tr>
<tr>
<td valign="top">
urls:
</td>
<td>
<asp:TextBox class="BodyText" Text="" ID="urls" Rows="10" Columns="30" TextMode="MultiLine"
runat="server" />
</td>
</tr>
<tr>
<td align="right" colspan="2">
<asp:Button class="BodyText" Text="search!" type="submit" OnClick="search" runat="server"
ID="Button1" />
</td>
</tr>
</table>
</form>
</body>
</html>
Search.aspx.cs 后台代码如下:
Code
using System;
using System.IO;
using System.Net;
using System.Web;
using System.Web.UI;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Threading;
/// <summary>
/// Code-behind for Search.aspx. Searches a list of URLs for a keyword, one
/// thread per URL, and exposes the per-page results and the total elapsed time.
/// </summary>
public class SearchEngine : Page
{
    // WebPage objects produced by the most recent SearchWebSites call.
    private ArrayList _pages;
    // Duration of the most recent SearchWebSites call.
    private TimeSpan _timeSpent;

    /// <summary>
    /// Returns an ArrayList of WebPage objects,
    /// which contains the search results information.
    /// </summary>
    public ArrayList SearchResults
    {
        get { return _pages; }
    }

    /// <summary>
    /// A TimeSpan object. It lets us know how long the entire search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// Searches every listed web page for the keyword, in parallel, and
    /// blocks until all pages have been processed.
    /// </summary>
    /// <param name="keyword">The keyword to search for.</param>
    /// <param name="pURLs">List of URLs, one per line (\n or \r\n separated).</param>
    /// <returns>Always true once every page has been processed.</returns>
    public bool SearchWebSites(string keyword, string pURLs)
    {
        // start the timer
        DateTime lStarted = DateTime.Now;
        _pages = new ArrayList();

        // A multi-line text box posts "\r\n" line breaks and often a trailing
        // blank line. Splitting on '\n' alone left a stray '\r' on each URL and
        // produced empty entries, so split on both and skip blanks.
        string[] lURLs = (pURLs ?? string.Empty).Split(
            new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

        ArrayList workers = new ArrayList(lURLs.Length);
        foreach (string rawUrl in lURLs)
        {
            string url = rawUrl.Trim();
            if (url.Length == 0)
            {
                continue;
            }

            // One WebPage per URL; its search() method runs on its own thread.
            WebPage wp = new WebPage(keyword, url);
            _pages.Add(wp);

            Thread worker = new Thread(new ThreadStart(wp.search));
            worker.Start();
            workers.Add(worker);
        }

        // wait for all the threads to finish
        foreach (Thread worker in workers)
        {
            worker.Join();
        }

        // stop the timer
        _timeSpent = DateTime.Now.Subtract(lStarted);
        return true;
    }
}
/// <summary>
/// Holds the search result for one web page: its title, how often the
/// keyword occurs in its text, and how long the page took to process.
/// </summary>
public class WebPage
{
    // Fixed patterns are built once and shared by all instances.
    // Text between a ">" and the next "<", i.e. the content outside tags.
    private static readonly Regex s_textPattern =
        new Regex(">(?:(?<c>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // Text that follows the opening <title> tag.
    private static readonly Regex s_titlePattern =
        new Regex("(?:<\\s*title\\s*>(?<t>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // private member fields.
    private int _instanceCount;
    private string _pageURL;
    private string _pageTitle;
    private string _keyword;
    private TimeSpan _timeSpent;

    /// <summary>
    /// A TimeSpan object. It lets us know how long the page search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// How many times the search keyword appears on the page.
    /// </summary>
    public int instanceCount
    {
        get { return _instanceCount; }
    }

    /// <summary>
    /// The URL of the searched page.
    /// </summary>
    public string pageURL
    {
        get { return _pageURL; }
    }

    /// <summary>
    /// The title of the searched page.
    /// </summary>
    public string pageTitle
    {
        get { return _pageTitle; }
    }

    public WebPage() { }

    /// <summary>
    /// A parameterized constructor of the WebPage class.
    /// </summary>
    /// <param name="keyword">The keyword to search for.</param>
    /// <param name="pageURL">The URL to connect to.</param>
    public WebPage(string keyword, string pageURL)
    {
        _keyword = keyword;
        _pageURL = pageURL;
    }

    /// <summary>
    /// Downloads the page, then fills in the title, the keyword count and the
    /// elapsed time. A page that cannot be fetched is reported with an empty
    /// title and a count of zero instead of throwing.
    /// </summary>
    public void search()
    {
        // start timing it
        DateTime lStarted = DateTime.Now;
        _pageTitle = "";
        _instanceCount = 0;

        try
        {
            WebRequest webreq = WebRequest.Create(_pageURL);
            StringBuilder sb = new StringBuilder();

            // Dispose the response and the reader even when reading fails.
            // NOTE(review): the body is decoded as ASCII, so non-ASCII text
            // cannot be matched; confirm whether that is intended.
            using (WebResponse webresp = webreq.GetResponse())
            using (StreamReader sr = new StreamReader(webresp.GetResponseStream(), Encoding.ASCII))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Keep a separator between lines; joining them directly
                    // fused the last word of one line with the first of the next.
                    sb.Append(line).Append(' ');
                }
            }

            string pageCode = sb.ToString();
            _pageTitle = getPageTitle(pageCode);
            _instanceCount = countInstance(getPureContent(pageCode));
        }
        // This method is used as a thread entry point, where an exception
        // cannot reach the caller. Expected fetch failures therefore leave the
        // empty result assigned above.
        catch (ArgumentNullException)
        {
            // no URL was supplied
        }
        catch (UriFormatException)
        {
            // the URL is malformed
        }
        catch (NotSupportedException)
        {
            // the URL scheme has no registered request handler
        }
        catch (WebException)
        {
            // the server could not be reached or returned an error status
        }
        catch (IOException)
        {
            // the connection failed while the body was being read
        }
        finally
        {
            // stop the timer
            _timeSpent = DateTime.Now.Subtract(lStarted);
        }
    }

    // Counts the case-insensitive occurrences of the keyword in str.
    // The keyword is matched literally: it is escaped before being used as a
    // pattern, so input such as "c++" or "a.b" neither throws nor over-matches.
    private int countInstance(string str)
    {
        if (string.IsNullOrEmpty(_keyword) || string.IsNullOrEmpty(str))
        {
            return 0;
        }

        Regex rx = new Regex(Regex.Escape(_keyword), RegexOptions.IgnoreCase);
        return rx.Matches(str).Count;
    }

    // Collects every run of text found between a ">" and the next "<",
    // HTML-decodes each run and joins the runs with single spaces.
    private string getPureContent(string str)
    {
        StringBuilder sb = new StringBuilder();
        for (Match mt = s_textPattern.Match(str); mt.Success; mt = mt.NextMatch())
        {
            sb.Append(HttpUtility.HtmlDecode(mt.Groups["c"].Value));
            sb.Append(" ");
        }
        return sb.ToString();
    }

    // Returns the text of the first <title> tag with surrounding whitespace
    // removed, or an empty string when the page has no title.
    private string getPageTitle(string str)
    {
        Match mt = s_titlePattern.Match(str);
        return mt.Success ? mt.Groups["t"].Value.Trim() : "";
    }
}
using System;
using System.IO;
using System.Net;
using System.Web;
using System.Web.UI;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Threading;
/// <summary>
/// Code-behind for Search.aspx. Searches a list of URLs for a keyword, one
/// thread per URL, and exposes the per-page results and the total elapsed time.
/// </summary>
public class SearchEngine : Page
{
    // WebPage objects produced by the most recent SearchWebSites call.
    private ArrayList _pages;
    // Duration of the most recent SearchWebSites call.
    private TimeSpan _timeSpent;

    /// <summary>
    /// Returns an ArrayList of WebPage objects,
    /// which contains the search results information.
    /// </summary>
    public ArrayList SearchResults
    {
        get { return _pages; }
    }

    /// <summary>
    /// A TimeSpan object. It lets us know how long the entire search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// Searches every listed web page for the keyword, in parallel, and
    /// blocks until all pages have been processed.
    /// </summary>
    /// <param name="keyword">The keyword to search for.</param>
    /// <param name="pURLs">List of URLs, one per line (\n or \r\n separated).</param>
    /// <returns>Always true once every page has been processed.</returns>
    public bool SearchWebSites(string keyword, string pURLs)
    {
        // start the timer
        DateTime lStarted = DateTime.Now;
        _pages = new ArrayList();

        // A multi-line text box posts "\r\n" line breaks and often a trailing
        // blank line. Splitting on '\n' alone left a stray '\r' on each URL and
        // produced empty entries, so split on both and skip blanks.
        string[] lURLs = (pURLs ?? string.Empty).Split(
            new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

        ArrayList workers = new ArrayList(lURLs.Length);
        foreach (string rawUrl in lURLs)
        {
            string url = rawUrl.Trim();
            if (url.Length == 0)
            {
                continue;
            }

            // One WebPage per URL; its search() method runs on its own thread.
            WebPage wp = new WebPage(keyword, url);
            _pages.Add(wp);

            Thread worker = new Thread(new ThreadStart(wp.search));
            worker.Start();
            workers.Add(worker);
        }

        // wait for all the threads to finish
        foreach (Thread worker in workers)
        {
            worker.Join();
        }

        // stop the timer
        _timeSpent = DateTime.Now.Subtract(lStarted);
        return true;
    }
}
/// <summary>
/// Holds the search result for one web page: its title, how often the
/// keyword occurs in its text, and how long the page took to process.
/// </summary>
public class WebPage
{
    // Fixed patterns are built once and shared by all instances.
    // Text between a ">" and the next "<", i.e. the content outside tags.
    private static readonly Regex s_textPattern =
        new Regex(">(?:(?<c>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // Text that follows the opening <title> tag.
    private static readonly Regex s_titlePattern =
        new Regex("(?:<\\s*title\\s*>(?<t>[^<]+))", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // private member fields.
    private int _instanceCount;
    private string _pageURL;
    private string _pageTitle;
    private string _keyword;
    private TimeSpan _timeSpent;

    /// <summary>
    /// A TimeSpan object. It lets us know how long the page search took.
    /// </summary>
    public TimeSpan timeSpent
    {
        get { return _timeSpent; }
    }

    /// <summary>
    /// How many times the search keyword appears on the page.
    /// </summary>
    public int instanceCount
    {
        get { return _instanceCount; }
    }

    /// <summary>
    /// The URL of the searched page.
    /// </summary>
    public string pageURL
    {
        get { return _pageURL; }
    }

    /// <summary>
    /// The title of the searched page.
    /// </summary>
    public string pageTitle
    {
        get { return _pageTitle; }
    }

    public WebPage() { }

    /// <summary>
    /// A parameterized constructor of the WebPage class.
    /// </summary>
    /// <param name="keyword">The keyword to search for.</param>
    /// <param name="pageURL">The URL to connect to.</param>
    public WebPage(string keyword, string pageURL)
    {
        _keyword = keyword;
        _pageURL = pageURL;
    }

    /// <summary>
    /// Downloads the page, then fills in the title, the keyword count and the
    /// elapsed time. A page that cannot be fetched is reported with an empty
    /// title and a count of zero instead of throwing.
    /// </summary>
    public void search()
    {
        // start timing it
        DateTime lStarted = DateTime.Now;
        _pageTitle = "";
        _instanceCount = 0;

        try
        {
            WebRequest webreq = WebRequest.Create(_pageURL);
            StringBuilder sb = new StringBuilder();

            // Dispose the response and the reader even when reading fails.
            // NOTE(review): the body is decoded as ASCII, so non-ASCII text
            // cannot be matched; confirm whether that is intended.
            using (WebResponse webresp = webreq.GetResponse())
            using (StreamReader sr = new StreamReader(webresp.GetResponseStream(), Encoding.ASCII))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Keep a separator between lines; joining them directly
                    // fused the last word of one line with the first of the next.
                    sb.Append(line).Append(' ');
                }
            }

            string pageCode = sb.ToString();
            _pageTitle = getPageTitle(pageCode);
            _instanceCount = countInstance(getPureContent(pageCode));
        }
        // This method is used as a thread entry point, where an exception
        // cannot reach the caller. Expected fetch failures therefore leave the
        // empty result assigned above.
        catch (ArgumentNullException)
        {
            // no URL was supplied
        }
        catch (UriFormatException)
        {
            // the URL is malformed
        }
        catch (NotSupportedException)
        {
            // the URL scheme has no registered request handler
        }
        catch (WebException)
        {
            // the server could not be reached or returned an error status
        }
        catch (IOException)
        {
            // the connection failed while the body was being read
        }
        finally
        {
            // stop the timer
            _timeSpent = DateTime.Now.Subtract(lStarted);
        }
    }

    // Counts the case-insensitive occurrences of the keyword in str.
    // The keyword is matched literally: it is escaped before being used as a
    // pattern, so input such as "c++" or "a.b" neither throws nor over-matches.
    private int countInstance(string str)
    {
        if (string.IsNullOrEmpty(_keyword) || string.IsNullOrEmpty(str))
        {
            return 0;
        }

        Regex rx = new Regex(Regex.Escape(_keyword), RegexOptions.IgnoreCase);
        return rx.Matches(str).Count;
    }

    // Collects every run of text found between a ">" and the next "<",
    // HTML-decodes each run and joins the runs with single spaces.
    private string getPureContent(string str)
    {
        StringBuilder sb = new StringBuilder();
        for (Match mt = s_textPattern.Match(str); mt.Success; mt = mt.NextMatch())
        {
            sb.Append(HttpUtility.HtmlDecode(mt.Groups["c"].Value));
            sb.Append(" ");
        }
        return sb.ToString();
    }

    // Returns the text of the first <title> tag with surrounding whitespace
    // removed, or an empty string when the page has no title.
    private string getPageTitle(string str)
    {
        Match mt = s_titlePattern.Match(str);
        return mt.Success ? mt.Groups["t"].Value.Trim() : "";
    }
}
再推荐两篇文章:
http://www.codeproject.com/KB/applications/SearchDotnet.aspx (Internal Site Search Engine 测试通过)
http://www.codeproject.com/KB/IP/Searcharoo_4.aspx (C# search engine: refactored to search Word, PDF and more)
测试通过,不过有点小麻烦,得学会序列化和反序列化(这里采用二进制)。 解决问题方法是:
1. 随便找4个文件,分别命名为:plaintext.txt,Kilimanjaro.pdf,Decorator.ppt,Marathoning.doc 放在目录 content 中;
2. 根据错误提示(提示为英文,注意英文提示上方的链接,该链接说明了如何对二进制文件和 XML 文件进行序列化与反序列化),创建二进制文件 z_searcharoo.dat 放在特定的目录下,然后运行!