由于最近网站需要更新的内容还是蛮多的,于是想开发一个采集系统。下面是收集到的一些资料。
1<%@ Page language="C#" Trace="True" %>
2<%@ Import Namespace="System.Net" %>
3<%@ Import Namespace="System.IO" %>
4
5<html>
6<head>
7
8
9<SCRIPT runat="server">
void Page_Load(Object sender, EventArgs e) {
    // Fetches the im286.com index page and shows its markup inside lblHTML.
    // Prefix inserted after every "src=" so that images use absolute links.
    string siteRoot = "http://www.im286.com/";
    WebRequest req = WebRequest.Create("http://www.im286.com/index.php");
    string html;
    try {
        // Decode the whole body through a single StreamReader. Decoding each
        // 512-byte chunk on its own corrupts any double-byte gb2312 character
        // that straddles a chunk boundary. The using blocks also close the
        // response and its stream, which were previously left open.
        // The response is assumed to be gb2312; for another code page
        // (e.g. "shift-jis") change the encoding name below.
        using (WebResponse result = req.GetResponse())
        using (StreamReader reader = new StreamReader(result.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312"))) {
            html = reader.ReadToEnd();
        }
    } catch (Exception) {
        html = "检索页时出错";
    }
    // Post-processing is unchanged: first prepend the site root to every
    // "src=", then strip all double quotes so src="x.gif" ends up as
    // src=http://www.im286.com/x.gif.
    html = html.Replace("src=", "src=" + siteRoot);
    lblHTML.Text = html.Replace("\"", "");
}
42</SCRIPT>
43</head>
44
45<body>
46
47 <form method="post" runat="server">
48
49 <asp:Label runat=server ID="lblHTML" Rows="30" Cols="80" EnableViewState="false" Wrap="True"></asp:Label>
50 </form>
51
52</body>
53</html>
54
2<%@ Import Namespace="System.Net" %>
3<%@ Import Namespace="System.IO" %>
4
5<html>
6<head>
7
8
9<SCRIPT runat="server">
void Page_Load(Object sender, EventArgs e) {
    // Fetches the im286.com index page and shows its markup inside lblHTML.
    // Prefix inserted after every "src=" so that images use absolute links.
    string siteRoot = "http://www.im286.com/";
    WebRequest req = WebRequest.Create("http://www.im286.com/index.php");
    string html;
    try {
        // Decode the whole body through a single StreamReader. Decoding each
        // 512-byte chunk on its own corrupts any double-byte gb2312 character
        // that straddles a chunk boundary. The using blocks also close the
        // response and its stream, which were previously left open.
        // The response is assumed to be gb2312; for another code page
        // (e.g. "shift-jis") change the encoding name below.
        using (WebResponse result = req.GetResponse())
        using (StreamReader reader = new StreamReader(result.GetResponseStream(), System.Text.Encoding.GetEncoding("gb2312"))) {
            html = reader.ReadToEnd();
        }
    } catch (Exception) {
        html = "检索页时出错";
    }
    // Post-processing is unchanged: first prepend the site root to every
    // "src=", then strip all double quotes so src="x.gif" ends up as
    // src=http://www.im286.com/x.gif.
    html = html.Replace("src=", "src=" + siteRoot);
    lblHTML.Text = html.Replace("\"", "");
}
42</SCRIPT>
43</head>
44
45<body>
46
47 <form method="post" runat="server">
48
49 <asp:Label runat=server ID="lblHTML" Rows="30" Cols="80" EnableViewState="false" Wrap="True"></asp:Label>
50 </form>
51
52</body>
53</html>
54
1using System;
2using System.Collections;
3using System.ComponentModel;
4using System.Data;
5using System.Drawing;
6using System.Web;
7using System.Web.SessionState;
8using System.Web.UI;
9using System.Web.UI.WebControls;
10using System.Web.UI.HtmlControls;
11using System.Text;
12using System.IO;
13using System.Net;
14
15namespace myclass.test
16{
17/// <summary>
18/// getPageHtml 的摘要说明。
19/// </summary>
20public class getPageHtml : System.Web.UI.Page
21{
/// <summary>
/// Downloads the sina.com home page, saves it to a fixed local file and
/// writes "ok" to the response when WriteFile returns true.
/// </summary>
private void Page_Load(object sender, System.EventArgs e)
{
    string html = getUrltoHtml("http://www.sina.com");
    if (WriteFile(html, "E:\\net_test\\test\\sina.htm"))
    {
        Response.Write("ok");
    }
    // The unused local string that was upper-cased and stripped of commas
    // had no observable effect and has been removed.
}
32
33Web 窗体设计器生成的代码
/// <summary>
/// Requests <paramref name="Url"/> and returns the response body decoded as gb2312 text.
/// </summary>
/// <param name="Url">URL to request.</param>
/// <returns>The complete response body.</returns>
/// <exception cref="System.Exception">
/// Thrown for any failure while requesting or reading. The message is the
/// underlying error's message and the underlying error is kept as InnerException.
/// </exception>
public string getUrltoHtml(string Url)
{
    try
    {
        System.Text.Encoding code = System.Text.Encoding.GetEncoding("gb2312");
        WebRequest wReq = WebRequest.Create(Url);
        // Dispose the response and reader so the connection is released
        // even when reading fails part-way through.
        using (WebResponse wResp = wReq.GetResponse())
        using (StreamReader reader = new StreamReader(wResp.GetResponseStream(), code))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Same exception type and message as before, but the original error
        // (type and stack trace) is now preserved as the inner exception.
        throw new Exception("" + ex.Message, ex);
    }
}
72
73//写文件
/// <summary>
/// Writes <paramref name="str"/> to <paramref name="OutFile"/> as gb2312 text,
/// overwriting any existing file.
/// </summary>
/// <param name="str">Text to write; nothing is written when it is null or empty.</param>
/// <param name="OutFile">Path of the file to create or overwrite.</param>
/// <returns>
/// true only when the text was actually written; false when there was nothing
/// to write or the write failed. This method does not throw.
/// </returns>
public bool WriteFile(string str, string OutFile )
{
    if (str == null || str.Length == 0)
    {
        return false;
    }

    try
    {
        System.Text.Encoding code = System.Text.Encoding.GetEncoding("gb2312");
        // using flushes and closes the writer on both success and failure.
        using (StreamWriter sw = new StreamWriter(OutFile, false, code))
        {
            sw.Write(str);
        }
        return true;
    }
    catch (Exception)
    {
        // Best effort by design: failures are reported through the return
        // value instead of being swallowed and reported as success.
        return false;
    }
}
105
106}
107}
108
2using System.Collections;
3using System.ComponentModel;
4using System.Data;
5using System.Drawing;
6using System.Web;
7using System.Web.SessionState;
8using System.Web.UI;
9using System.Web.UI.WebControls;
10using System.Web.UI.HtmlControls;
11using System.Text;
12using System.IO;
13using System.Net;
14
15namespace myclass.test
16{
17/// <summary>
18/// getPageHtml 的摘要说明。
19/// </summary>
20public class getPageHtml : System.Web.UI.Page
21{
/// <summary>
/// Downloads the sina.com home page, saves it to a fixed local file and
/// writes "ok" to the response when WriteFile returns true.
/// </summary>
private void Page_Load(object sender, System.EventArgs e)
{
    string html = getUrltoHtml("http://www.sina.com");
    if (WriteFile(html, "E:\\net_test\\test\\sina.htm"))
    {
        Response.Write("ok");
    }
    // The unused local string that was upper-cased and stripped of commas
    // had no observable effect and has been removed.
}
32
33Web 窗体设计器生成的代码
/// <summary>
/// Requests <paramref name="Url"/> and returns the response body decoded as gb2312 text.
/// </summary>
/// <param name="Url">URL to request.</param>
/// <returns>The complete response body.</returns>
/// <exception cref="System.Exception">
/// Thrown for any failure while requesting or reading. The message is the
/// underlying error's message and the underlying error is kept as InnerException.
/// </exception>
public string getUrltoHtml(string Url)
{
    try
    {
        System.Text.Encoding code = System.Text.Encoding.GetEncoding("gb2312");
        WebRequest wReq = WebRequest.Create(Url);
        // Dispose the response and reader so the connection is released
        // even when reading fails part-way through.
        using (WebResponse wResp = wReq.GetResponse())
        using (StreamReader reader = new StreamReader(wResp.GetResponseStream(), code))
        {
            return reader.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Same exception type and message as before, but the original error
        // (type and stack trace) is now preserved as the inner exception.
        throw new Exception("" + ex.Message, ex);
    }
}
72
73//写文件
/// <summary>
/// Writes <paramref name="str"/> to <paramref name="OutFile"/> as gb2312 text,
/// overwriting any existing file.
/// </summary>
/// <param name="str">Text to write; nothing is written when it is null or empty.</param>
/// <param name="OutFile">Path of the file to create or overwrite.</param>
/// <returns>
/// true only when the text was actually written; false when there was nothing
/// to write or the write failed. This method does not throw.
/// </returns>
public bool WriteFile(string str, string OutFile )
{
    if (str == null || str.Length == 0)
    {
        return false;
    }

    try
    {
        System.Text.Encoding code = System.Text.Encoding.GetEncoding("gb2312");
        // using flushes and closes the writer on both success and failure.
        using (StreamWriter sw = new StreamWriter(OutFile, false, code))
        {
            sw.Write(str);
        }
        return true;
    }
    catch (Exception)
    {
        // Best effort by design: failures are reported through the return
        // value instead of being swallowed and reported as success.
        return false;
    }
}
105
106}
107}
108
用下面的 Get_Data 方法提取内容。它有两个参数:start_string 是搜索开始的标识,end_string 是搜索结束的标识。
在程序中,这两个参数最好使用英文字母;如果是汉字,就需要先转换一下,比如:
// Encodes the Chinese start marker with the system default (ANSI) code page,
// then decodes those same bytes as UTF-8.
// NOTE(review): .NET strings are already Unicode, and bytes produced by a
// non-UTF-8 default code page are generally not valid UTF-8, so this round
// trip looks likely to corrupt the marker instead of converting it. Confirm
// whether the conversion is needed at all.
3byte[] startCN = System.Text.Encoding.Default.GetBytes("这里写开始标记");
4string startUTF8 = System.Text.Encoding.UTF8.GetString(startCN);
5
6
/// <summary>
/// Downloads the configured page (decoded as GB2312, with line breaks removed)
/// and returns the text from the first occurrence of
/// <paramref name="start_string"/> up to, but not including, the next
/// occurrence of <paramref name="end_string"/>.
/// </summary>
/// <param name="start_string">Marker where extraction begins; it is included in the result.</param>
/// <param name="end_string">Marker where extraction stops; it is excluded from the result.</param>
/// <returns>
/// The extracted text, or an empty string when the start marker is missing or
/// no end marker is found at or after it.
/// </returns>
public string Get_Data(string start_string,string end_string)
{
    System.Text.StringBuilder html = new System.Text.StringBuilder();
    WebRequest wreq = WebRequest.Create("这里是网址");

    // Dispose the response and reader so the connection is always released.
    using (HttpWebResponse wresp = (HttpWebResponse)wreq.GetResponse())
    using (StreamReader objReader = new StreamReader(wresp.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
    {
        // Lines are joined without separators, exactly as before; a
        // StringBuilder avoids re-copying the whole page for every line.
        string sLine;
        while ((sLine = objReader.ReadLine()) != null)
        {
            html.Append(sLine);
        }
    }

    string page = html.ToString();
    int start = page.IndexOf(start_string);
    if (start < 0)
    {
        return string.Empty;
    }

    // Search from the start marker onward so an earlier stray end marker can
    // no longer produce a negative length and make Substring throw.
    int stop = page.IndexOf(end_string, start);
    if (stop < 0)
    {
        return string.Empty;
    }

    return page.Substring(start, stop - start);
}
30
在程序中,这两个参数最好使用英文字母;如果是汉字,就需要先转换一下,比如:
// Encodes the Chinese start marker with the system default (ANSI) code page,
// then decodes those same bytes as UTF-8.
// NOTE(review): .NET strings are already Unicode, and bytes produced by a
// non-UTF-8 default code page are generally not valid UTF-8, so this round
// trip looks likely to corrupt the marker instead of converting it. Confirm
// whether the conversion is needed at all.
3byte[] startCN = System.Text.Encoding.Default.GetBytes("这里写开始标记");
4string startUTF8 = System.Text.Encoding.UTF8.GetString(startCN);
5
6
/// <summary>
/// Downloads the configured page (decoded as GB2312, with line breaks removed)
/// and returns the text from the first occurrence of
/// <paramref name="start_string"/> up to, but not including, the next
/// occurrence of <paramref name="end_string"/>.
/// </summary>
/// <param name="start_string">Marker where extraction begins; it is included in the result.</param>
/// <param name="end_string">Marker where extraction stops; it is excluded from the result.</param>
/// <returns>
/// The extracted text, or an empty string when the start marker is missing or
/// no end marker is found at or after it.
/// </returns>
public string Get_Data(string start_string,string end_string)
{
    System.Text.StringBuilder html = new System.Text.StringBuilder();
    WebRequest wreq = WebRequest.Create("这里是网址");

    // Dispose the response and reader so the connection is always released.
    using (HttpWebResponse wresp = (HttpWebResponse)wreq.GetResponse())
    using (StreamReader objReader = new StreamReader(wresp.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
    {
        // Lines are joined without separators, exactly as before; a
        // StringBuilder avoids re-copying the whole page for every line.
        string sLine;
        while ((sLine = objReader.ReadLine()) != null)
        {
            html.Append(sLine);
        }
    }

    string page = html.ToString();
    int start = page.IndexOf(start_string);
    if (start < 0)
    {
        return string.Empty;
    }

    // Search from the start marker onward so an earlier stray end marker can
    // no longer produce a negative length and make Substring throw.
    int stop = page.IndexOf(end_string, start);
    if (stop < 0)
    {
        return string.Empty;
    }

    return page.Substring(start, stop - start);
}
30
// Download a page with WebClient and decode it with the system default encoding.
string PageUrl = "http://pachong.cn";
string result;
// using guarantees the client is disposed even when the download throws.
using (WebClient wc = new WebClient())
{
    wc.Credentials = CredentialCache.DefaultCredentials;
    Byte[] pageData = wc.DownloadData(PageUrl);
    result = Encoding.Default.GetString(pageData);
}
// Download PageUrl with WebClient and decode it with the system default encoding.
string result;
// using guarantees the client is disposed even when the download throws.
using (WebClient wc = new WebClient())
{
    wc.Credentials = CredentialCache.DefaultCredentials;
    Byte[] pageData = wc.DownloadData(PageUrl);
    result = Encoding.Default.GetString(pageData);
}
1GetPageHTML.aspx
2<%@ Page language="c#" validateRequest = "false" Codebehind="GetPageHtml.aspx.cs"
3 AutoEventWireup="false" Inherits="eMeng.Exam.GetPageHtml" %>
<%--
  Page with a URL text box (UrlText), two fetch buttons (WebClientButton,
  WebRequestButton) and a multi-line text box (ContentHtml) for the result.
  Request validation is switched off; presumably so that HTML shown in
  ContentHtml can be posted back (confirm).
  NOTE(review): the code-behind class eMeng.Exam.GetPageHtml also declares a
  Button field named GetText with a GetText_Click handler, but no control with
  id="GetText" is declared in this markup. Confirm whether it was left out.
--%>
4<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" >
5<HTML>
6 <HEAD>
7 <title>得到网页源代码</title>
8 <meta name="GENERATOR" Content="Microsoft Visual Studio 7.0">
9 <meta name="CODE_LANGUAGE" Content="C#">
10 <meta name="vs_defaultClientScript" content="JavaScript">
11 <meta name="vs_targetSchema" content="http://schemas.microsoft.com/intellisense/ie5">
12 </HEAD>
13 <body MS_POSITIONING="GridLayout">
14 <form id="aspNetBuffer" method="post" runat="server">
15 <div align="center" style="FONT-WEIGHT: bold">得到任意网页源代码</div>
16 <asp:TextBox id="UrlText" runat="server" Width="400px">http://dotnet.aspx.cc/content.aspx
17 </asp:TextBox>
18 <asp:Button id="WebClientButton" Runat="server" Text="用WebClient得到"></asp:Button>
19 <asp:Button id="WebRequestButton" runat="server" Text="用WebRequest得到"></asp:Button>
20 <br>
21 <asp:TextBox id="ContentHtml" runat="server" Width="100%" Height="360px" TextMode="MultiLine">
22 </asp:TextBox>
23 </form>
24 </body>
25</HTML>
26
27
2<%@ Page language="c#" validateRequest = "false" Codebehind="GetPageHtml.aspx.cs"
3 AutoEventWireup="false" Inherits="eMeng.Exam.GetPageHtml" %>
<%--
  Page with a URL text box (UrlText), two fetch buttons (WebClientButton,
  WebRequestButton) and a multi-line text box (ContentHtml) for the result.
  Request validation is switched off; presumably so that HTML shown in
  ContentHtml can be posted back (confirm).
  NOTE(review): the code-behind class eMeng.Exam.GetPageHtml also declares a
  Button field named GetText with a GetText_Click handler, but no control with
  id="GetText" is declared in this markup. Confirm whether it was left out.
--%>
4<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" >
5<HTML>
6 <HEAD>
7 <title>得到网页源代码</title>
8 <meta name="GENERATOR" Content="Microsoft Visual Studio 7.0">
9 <meta name="CODE_LANGUAGE" Content="C#">
10 <meta name="vs_defaultClientScript" content="JavaScript">
11 <meta name="vs_targetSchema" content="http://schemas.microsoft.com/intellisense/ie5">
12 </HEAD>
13 <body MS_POSITIONING="GridLayout">
14 <form id="aspNetBuffer" method="post" runat="server">
15 <div align="center" style="FONT-WEIGHT: bold">得到任意网页源代码</div>
16 <asp:TextBox id="UrlText" runat="server" Width="400px">http://dotnet.aspx.cc/content.aspx
17 </asp:TextBox>
18 <asp:Button id="WebClientButton" Runat="server" Text="用WebClient得到"></asp:Button>
19 <asp:Button id="WebRequestButton" runat="server" Text="用WebRequest得到"></asp:Button>
20 <br>
21 <asp:TextBox id="ContentHtml" runat="server" Width="100%" Height="360px" TextMode="MultiLine">
22 </asp:TextBox>
23 </form>
24 </body>
25</HTML>
26
27
1using System;
2using System.Collections;
3using System.ComponentModel;
4using System.Data;
5using System.Drawing;
6using System.Web;
7using System.Web.SessionState;
8using System.Web.UI;
9using System.Web.UI.WebControls;
10using System.Web.UI.HtmlControls;
11using System.IO;
12using System.Net;
13using System.Text;
14using System.Text.RegularExpressions;
15namespace eMeng.Exam
16{
17/// <summary>
18/// GetPageHtml 的摘要说明。
19/// </summary>
20public class GetPageHtml : System.Web.UI.Page
21{
22protected System.Web.UI.WebControls.Button WebClientButton;
23protected System.Web.UI.WebControls.Button WebRequestButton;
24protected System.Web.UI.WebControls.TextBox ContentHtml;
25protected System.Web.UI.WebControls.TextBox UrlText;
26protected System.Web.UI.WebControls.Button GetText;
27private string PageUrl = "";
28
/// <summary>
/// Page load handler. It has an empty body: nothing is done when the page loads.
/// </summary>
private void Page_Load(object sender, System.EventArgs e)
{
}
31
32Web Form Designer generated code
52
/// <summary>
/// Click handler: downloads the URL typed into UrlText with WebClient and
/// shows the page source in ContentHtml, decoded with the system default encoding.
/// </summary>
private void WebClientButton_Click(object sender, System.EventArgs e)
{
    PageUrl = UrlText.Text;

    // NOTE(review): the URL comes straight from the text box and is fetched
    // with the server's default credentials; consider restricting the allowed
    // scheme/host before exposing this page to untrusted users.

    // using guarantees the client is disposed even when the download throws.
    using (WebClient wc = new WebClient())
    {
        wc.Credentials = CredentialCache.DefaultCredentials;

        // Approach 1: download the whole body as bytes, then decode it.
        Byte[] pageData = wc.DownloadData(PageUrl);
        ContentHtml.Text = Encoding.Default.GetString(pageData);

        // Approach 2 (alternative): open wc.OpenRead(PageUrl), wrap it in a
        // StreamReader created with System.Text.Encoding.Default, assign
        // ReadToEnd() to ContentHtml.Text and close the stream.
    }
}
74
/// <summary>
/// Click handler: requests the URL typed into UrlText with WebRequest and
/// shows the page source in ContentHtml, decoded with the system default encoding.
/// </summary>
private void WebRequestButton_Click(object sender, System.EventArgs e)
{
    PageUrl = UrlText.Text;
    WebRequest request = WebRequest.Create(PageUrl);

    // The using blocks close the reader, its stream and the response even
    // when reading fails; previously the response was never closed and the
    // stream was closed only on success.
    using (WebResponse response = request.GetResponse())
    using (StreamReader sr = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
    {
        ContentHtml.Text = sr.ReadToEnd();
    }
}
86
/// <summary>
/// Click handler: requests the URL typed into UrlText, removes every
/// &lt;...&gt; tag from the page, collapses each run of whitespace into one
/// space and shows the remaining text in ContentHtml.
/// </summary>
private void GetText_Click(object sender, System.EventArgs e)
{
    PageUrl = UrlText.Text;
    WebRequest request = WebRequest.Create(PageUrl);

    string text;
    // The using blocks close the reader, its stream and the response even
    // when reading fails; previously the response was never closed and the
    // stream was closed only on success.
    using (WebResponse response = request.GetResponse())
    using (StreamReader sr = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
    {
        text = sr.ReadToEnd();
    }

    // Drop anything that looks like a tag.
    text = Regex.Replace(text, "<[^>]*>", "");
    // Collapse whitespace runs (including line breaks) into single spaces.
    ContentHtml.Text = Regex.Replace(text, "\\s+", " ");
}
101}
102}
103
104
2using System.Collections;
3using System.ComponentModel;
4using System.Data;
5using System.Drawing;
6using System.Web;
7using System.Web.SessionState;
8using System.Web.UI;
9using System.Web.UI.WebControls;
10using System.Web.UI.HtmlControls;
11using System.IO;
12using System.Net;
13using System.Text;
14using System.Text.RegularExpressions;
15namespace eMeng.Exam
16{
17/// <summary>
18/// GetPageHtml 的摘要说明。
19/// </summary>
20public class GetPageHtml : System.Web.UI.Page
21{
22protected System.Web.UI.WebControls.Button WebClientButton;
23protected System.Web.UI.WebControls.Button WebRequestButton;
24protected System.Web.UI.WebControls.TextBox ContentHtml;
25protected System.Web.UI.WebControls.TextBox UrlText;
26protected System.Web.UI.WebControls.Button GetText;
27private string PageUrl = "";
28
/// <summary>
/// Page load handler. It has an empty body: nothing is done when the page loads.
/// </summary>
private void Page_Load(object sender, System.EventArgs e)
{
}
31
32Web Form Designer generated code
52
/// <summary>
/// Click handler: downloads the URL typed into UrlText with WebClient and
/// shows the page source in ContentHtml, decoded with the system default encoding.
/// </summary>
private void WebClientButton_Click(object sender, System.EventArgs e)
{
    PageUrl = UrlText.Text;

    // NOTE(review): the URL comes straight from the text box and is fetched
    // with the server's default credentials; consider restricting the allowed
    // scheme/host before exposing this page to untrusted users.

    // using guarantees the client is disposed even when the download throws.
    using (WebClient wc = new WebClient())
    {
        wc.Credentials = CredentialCache.DefaultCredentials;

        // Approach 1: download the whole body as bytes, then decode it.
        Byte[] pageData = wc.DownloadData(PageUrl);
        ContentHtml.Text = Encoding.Default.GetString(pageData);

        // Approach 2 (alternative): open wc.OpenRead(PageUrl), wrap it in a
        // StreamReader created with System.Text.Encoding.Default, assign
        // ReadToEnd() to ContentHtml.Text and close the stream.
    }
}
74
/// <summary>
/// Click handler: requests the URL typed into UrlText with WebRequest and
/// shows the page source in ContentHtml, decoded with the system default encoding.
/// </summary>
private void WebRequestButton_Click(object sender, System.EventArgs e)
{
    PageUrl = UrlText.Text;
    WebRequest request = WebRequest.Create(PageUrl);

    // The using blocks close the reader, its stream and the response even
    // when reading fails; previously the response was never closed and the
    // stream was closed only on success.
    using (WebResponse response = request.GetResponse())
    using (StreamReader sr = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
    {
        ContentHtml.Text = sr.ReadToEnd();
    }
}
86
/// <summary>
/// Click handler: requests the URL typed into UrlText, removes every
/// &lt;...&gt; tag from the page, collapses each run of whitespace into one
/// space and shows the remaining text in ContentHtml.
/// </summary>
private void GetText_Click(object sender, System.EventArgs e)
{
    PageUrl = UrlText.Text;
    WebRequest request = WebRequest.Create(PageUrl);

    string text;
    // The using blocks close the reader, its stream and the response even
    // when reading fails; previously the response was never closed and the
    // stream was closed only on success.
    using (WebResponse response = request.GetResponse())
    using (StreamReader sr = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
    {
        text = sr.ReadToEnd();
    }

    // Drop anything that looks like a tag.
    text = Regex.Replace(text, "<[^>]*>", "");
    // Collapse whitespace runs (including line breaks) into single spaces.
    ContentHtml.Text = Regex.Replace(text, "\\s+", " ");
}
101}
102}
103
104
下面的代码针对通过 ISA Server 代理上网的情况。
修改一下 WebRequest 的用法:
// Send the request through an authenticating proxy (e.g. ISA Server).
PageUrl = UrlText.Text;
WebRequest request = WebRequest.Create(PageUrl);

// Configure a dedicated WebProxy for this request. The earlier version threw
// away the new instance and cast request.Proxy to WebProxy instead; that
// property is typed IWebProxy, so the cast is not guaranteed to succeed, and
// editing the returned object could alter a proxy shared with other requests.
WebProxy myProxy = new WebProxy();
myProxy.Address = new Uri("http://代理服务器:端口");
myProxy.Credentials = new NetworkCredential("用户名", "密码", "域名");
request.Proxy = myProxy;

WebResponse response = request.GetResponse();
修改一下 WebRequest 的用法:
// Send the request through an authenticating proxy (e.g. ISA Server).
PageUrl = UrlText.Text;
WebRequest request = WebRequest.Create(PageUrl);

// Configure a dedicated WebProxy for this request. The earlier version threw
// away the new instance and cast request.Proxy to WebProxy instead; that
// property is typed IWebProxy, so the cast is not guaranteed to succeed, and
// editing the returned object could alter a proxy shared with other requests.
WebProxy myProxy = new WebProxy();
myProxy.Address = new Uri("http://代理服务器:端口");
myProxy.Credentials = new NetworkCredential("用户名", "密码", "域名");
request.Proxy = myProxy;

WebResponse response = request.GetResponse();