C# 抓取网页并分析
抓取网页,分析网页内容,进行处理获取信息。
例子:
抓取 km169 上 ADSL 用户的费用信息,分析后存储到本地数据库。
步骤:1、抓取。2、分析。3、存储。
| ||
// Step 1: fetch the page.

/// <summary>
/// Sends <paramref name="postData"/> to <paramref name="url"/> as a
/// form-encoded HTTP POST and returns the response body as text.
/// </summary>
/// <param name="url">Absolute URL of the page to request.</param>
/// <param name="postData">
/// Already URL-encoded form body. <c>null</c> is treated as an empty body.
/// </param>
/// <param name="err">
/// Receives the exception message when the request fails; otherwise an empty string.
/// </param>
/// <returns>The response body (HTML), or an empty string on failure.</returns>
public string GetPage(string url, string postData, out string err)
{
    err = string.Empty;

    // The same encoding is used for the request body and for decoding the
    // response, as in the original code (system default ANSI code page).
    Encoding encoding = Encoding.Default;

    try
    {
        byte[] data = encoding.GetBytes(postData ?? string.Empty);

        // Prepare the request.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;

        // Write the form body; the stream must be closed before GetResponse().
        using (Stream outstream = request.GetRequestStream())
        {
            outstream.Write(data, 0, data.Length);
        }

        // Wait for the server's reply and read the whole body. The using
        // blocks release the connection even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream instream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(instream, encoding))
        {
            return sr.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: report the failure through err instead of throwing.
        err = ex.Message;
        return string.Empty;
    }
}
// [Forum note: last edited by 王暴徒 on 2006-2-13 at 13:49]
| ||
// Step 2: analyse the page.

/// <summary>
/// Fetches the billing page via GetPage and stores one Rec per
/// <c>table_det("...", "...", ...);</c> call found in it.
/// </summary>
/// <returns>Always <c>null</c>.</returns>
public string Get()
{
    string html = GetPage(KMADSLURL, strReq, out err);

    // One capture group per quoted argument of table_det(...).
    Regex recordPattern = new Regex(
        "table_det\\(\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\",\\n*\\s*\"([^\"]*)\"\\);",
        RegexOptions.Singleline);

    MatchCollection records = recordPattern.Matches(html);
    for (int index = 0; index < records.Count; index++)
    {
        // Only the first field is stored in this example.
        Rec rec = new Rec();
        rec.str1 = records[index].Groups[1].Value;
        rec.Save();
    }

    return null;
}
// Author's notes:
// The key part is the regular expression. Each match yields one record, and
// the capture groups (the author refers to them as %1~%9) yield the
// individual field values, from which the record is built (concatenating an
// SQL statement would work as well). Rec is a small persistence helper, to
// be described in a later post.
// Regex tip: a negated character class on the delimiter ([^"]*) is what
// splits the fields; it is easiest to understand by experimenting with it.
// [Forum note: last edited by 王暴徒 on 2006-2-13 at 13:58]
| ||||||||||||||||||||||||
我以前写过一个多线程批量下载歌曲的程序,当时的设计是对接百度,同时预留了扩展性,比如通过配置也可以获取雅虎的歌曲,这就需要考虑各个网站网页的编码方式。和暴徒一样,我也用了 HttpWebResponse 类。通过在二进制下观察各种编码的网页,发现它们的前 2 个字节不同,所以转换成 string 时需要特殊处理,否则中文会出现乱码。
|
| ||
namespace Mp3Crazy
{
    using System;

    /// <summary>
    /// Describes one chunk of a download: which file, the byte position it
    /// starts at and its length. Optionally carries the callback that
    /// downloads the chunk on a worker thread.
    /// </summary>
    public class DownLoadState
    {
        private string _FileName;
        private string _AttachmentName;
        private int _Position;
        private string _RequestURL;
        private string _ResponseURL;
        private int _Length;
        private byte[] _Data;
        private ThreadCallbackHandler _ThreadCallback;
        private HttpWebClient _hwc;

        /// <summary>Local path the download is saved to.</summary>
        public string FileName
        {
            get { return _FileName; }
        }

        /// <summary>Byte position of this chunk.</summary>
        public int Position
        {
            get { return _Position; }
        }

        /// <summary>Length of this chunk in bytes.</summary>
        public int Length
        {
            get { return _Length; }
        }

        /// <summary>Attachment name passed in by the creator of this state.</summary>
        public string AttachmentName
        {
            get { return _AttachmentName; }
        }

        /// <summary>URL that was requested.</summary>
        public string RequestURL
        {
            get { return _RequestURL; }
        }

        /// <summary>Response URL or path passed in by the creator of this state.</summary>
        public string ResponseURL
        {
            get { return _ResponseURL; }
        }

        /// <summary>Optional data buffer supplied at construction; may be null.</summary>
        public byte[] Data
        {
            get { return _Data; }
        }

        /// <summary>
        /// Client notified (via OnThreadProcess) after the chunk callback has run.
        /// </summary>
        public HttpWebClient httpWebClient
        {
            get { return this._hwc; }
            set { this._hwc = value; }
        }

        /// <summary>Creates a state that only describes a chunk.</summary>
        internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length)
        {
            this._RequestURL = RequestURL;
            this._ResponseURL = ResponseURL;
            this._FileName = FileName;
            this._AttachmentName = AttachmentName;
            this._Position = Position;
            this._Length = Length;
        }

        /// <summary>Creates a state that also carries a data buffer.</summary>
        internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length, byte[] Data)
            : this(RequestURL, ResponseURL, FileName, AttachmentName, Position, Length)
        {
            this._Data = Data;
        }

        /// <summary>
        /// Creates a state that can download its chunk through
        /// <paramref name="tch"/> when StartDownloadFileChunk is called.
        /// </summary>
        internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length, ThreadCallbackHandler tch)
            : this(RequestURL, ResponseURL, FileName, AttachmentName, Position, Length)
        {
            this._ThreadCallback = tch;
        }

        /// <summary>
        /// Thread entry point: invokes the chunk callback with this state's
        /// request URL, file name, position and length, then notifies the
        /// owning client. Does nothing when no callback was supplied.
        /// </summary>
        internal void StartDownloadFileChunk()
        {
            if (this._ThreadCallback == null)
            {
                return;
            }

            this._ThreadCallback(this._RequestURL, this._FileName, this._Position, this._Length);

            // The client is set through a property after construction, so it
            // may be missing; a NullReferenceException here would be unhandled
            // on the worker thread.
            if (this._hwc != null)
            {
                this._hwc.OnThreadProcess("");
            }
        }
    }
}
| ||
/*
 * .NET/C#: utility class for multi-threaded downloads with resume support.
 * Author's note: derived from a Reflector dump of System.Net.WebClient, with
 * several Download/Upload related methods rewritten or added, and the
 * DataReceive and ExceptionOccurrs events added.
 */
namespace Mp3Crazy
{
    using System;
    using System.IO;
    using System.Net;
    using System.Text;
    using System.Security;
    using System.Threading;
    using System.Collections.Specialized;

    /// <summary>
    /// Signature of the method a worker thread runs to download one chunk
    /// (request URL, local file name, start position, length).
    /// </summary>
    public delegate void ThreadCallbackHandler(string S, string s, int I, int i);

    /// <summary>Actions an exception handler can request.</summary>
    public enum ExceptionActions
    {
        Throw,
        CancelAll,
        Ignore,
        Retry
    }

    /// <summary>
    /// Downloads a file over HTTP in chunks, one worker thread per additional
    /// chunk, writing every chunk at its own offset of the same local file.
    /// </summary>
    public class HttpWebClient
    {
        public delegate void ExceptionEventHandler(HttpWebClient Sender, ExceptionEventArgs e);

        /// <summary>Raised when a download operation fails.</summary>
        public event ExceptionEventHandler ExceptionOccurrs;

        public delegate void ThreadProcessEventHandler(HttpWebClient Sender, ThreadProcessEventArgs e);

        /// <summary>Raised each time one chunk has finished processing.</summary>
        public event ThreadProcessEventHandler ThreadProcessEnd;

        // Chunks are at least 128 KB.
        private const int MinChunkSize = 2 * 64 * 1024;

        // Size of the buffer used to copy from the network to the file.
        private const int CopyBufferSize = 30000;

        private int _FileLength, _getLength;

        public int TimeOut = 20000;
        public int SongID = 0;
        public bool UrlParsed;
        public string FileName;
        public bool Free = true;
        public int RetryTimes;
        public int TBlocks = 1, curBlock;

        private NameValueCollection m_requestParameters;
        private Uri m_baseAddress;
        private ICredentials m_credentials = CredentialCache.DefaultCredentials;

        /// <summary>Total size of the file being downloaded, in bytes.</summary>
        public int FileLength
        {
            get { return _FileLength; }
        }

        /// <summary>Number of bytes written so far, summed over all chunks.</summary>
        public int GetLength
        {
            get { return _getLength; }
        }

        // [Forum note: last edited by Timothy on 2006-2-17 at 09:48]

        /// <summary>
        /// Downloads a file in chunks.
        /// </summary>
        /// <param name="Address">URL of the file.</param>
        /// <param name="FileName">Local path the file is saved to.</param>
        /// <param name="ChunksCount">
        /// Requested number of chunks (threads). Values below 1 are treated
        /// as 1; fewer chunks are used when they would be smaller than 128 KB.
        /// </param>
        /// <remarks>
        /// Failures are reported through <see cref="ExceptionOccurrs"/> and
        /// are not thrown. The first chunk is read on the calling thread from
        /// the initial response; every further chunk gets its own thread.
        /// The method returns when the first chunk is done, not when all
        /// threads have finished.
        /// </remarks>
        public void DownloadFile(string Address, string FileName, int ChunksCount)
        {
            int p = 0; // start position of the chunk being set up
            int s = 0; // length of the chunk being set up
            _getLength = 0;
            string a = null;
            HttpWebResponse hwrp = null;
            try
            {
                if (ChunksCount < 1)
                {
                    ChunksCount = 1;
                }

                HttpWebRequest hwrq = (HttpWebRequest)WebRequest.Create(this.GetUri(Address));
                hwrq.Timeout = TimeOut;
                // Credentials only take effect when set before the request is sent.
                hwrq.Credentials = this.m_credentials;
                hwrp = (HttpWebResponse)hwrq.GetResponse();

                long L = hwrp.ContentLength;
                bool lengthKnown = (L != -1);
                if (!lengthKnown || L > int.MaxValue)
                {
                    L = int.MaxValue;
                }
                int l = (int)L;
                this._FileLength = l;

                a = GetAttachmentName(hwrp, FileName);

                int chunkSize = l / ChunksCount;
                if (chunkSize < MinChunkSize)
                {
                    chunkSize = MinChunkSize;
                }

                // With an unknown length there is nothing to split: read one
                // stream until it ends.
                int chunkCount = 1;
                if (lengthKnown && l >= chunkSize)
                {
                    chunkCount = l / chunkSize;
                }

                // Chunks 1..n-1 run on their own threads; the last one also
                // takes the remainder of the division.
                // NOTE: whether the server honours Range requests is not
                // checked here (the original Accept-Ranges test was disabled).
                for (int i = 1; i < chunkCount; i++)
                {
                    p = i * chunkSize;
                    s = (i == chunkCount - 1) ? (l - p) : chunkSize;

                    DownLoadState x = new DownLoadState(Address, hwrp.ResponseUri.AbsolutePath, FileName, a, p, s, new ThreadCallbackHandler(this.DownloadFileChunk));
                    x.httpWebClient = this;

                    Thread t = new Thread(new ThreadStart(x.StartDownloadFileChunk));
                    t.Start();
                }

                // Chunk 0 is read from the response that is already open. When
                // it is the only chunk it covers the whole file.
                p = 0;
                s = (chunkCount == 1) ? l : chunkSize;
                this.ResponseAsBytes(Address, hwrp, s, FileName);
                this.OnThreadProcess("");
            }
            catch (Exception e)
            {
                string path = (hwrp != null) ? hwrp.ResponseUri.AbsolutePath : "";
                this.OnExceptionOccurred(e, new DownLoadState(Address, path, FileName, a, p, s));
            }
            finally
            {
                if (hwrp != null)
                {
                    hwrp.Close();
                }
            }
        }

        /// <summary>Raises <see cref="ThreadProcessEnd"/> with the given id.</summary>
        internal void OnThreadProcess(string id)
        {
            // Copy to a local so an unsubscribe on another thread cannot null
            // the delegate between the check and the call.
            ThreadProcessEventHandler handler = this.ThreadProcessEnd;
            if (handler != null)
            {
                handler(this, new ThreadProcessEventArgs(id));
            }
        }

        /// <summary>
        /// Downloads one chunk of a file. Can be used directly to build a
        /// custom multi-threaded or resumable download.
        /// </summary>
        /// <param name="Address">URL of the file.</param>
        /// <param name="FileName">Local path the file is saved to.</param>
        /// <param name="FromPosition">Byte offset the chunk starts at.</param>
        /// <param name="Length">Chunk size in bytes.</param>
        /// <remarks>
        /// Failures are reported through <see cref="ExceptionOccurrs"/> and
        /// are not thrown.
        /// </remarks>
        public void DownloadFileChunk(string Address, string FileName, int FromPosition, int Length)
        {
            HttpWebResponse hwrp = null;
            string a = null;
            try
            {
                HttpWebRequest hwrq = (HttpWebRequest)WebRequest.Create(this.GetUri(Address));
                hwrq.Timeout = TimeOut;
                hwrq.Credentials = this.m_credentials;
                hwrq.AddRange(FromPosition);
                hwrp = (HttpWebResponse)hwrq.GetResponse();

                // A server that ignores the Range header answers with the
                // whole file; writing that would put the wrong bytes on disk.
                if (FromPosition > 0 && hwrp.StatusCode != HttpStatusCode.PartialContent)
                {
                    throw new NotSupportedException("The server did not return partial content for the requested range.");
                }

                a = GetAttachmentName(hwrp, FileName);
                this.ResponseAsBytes(Address, hwrp, Length, FileName);
            }
            catch (Exception e)
            {
                string path = (hwrp != null) ? hwrp.ResponseUri.AbsolutePath : "";
                this.OnExceptionOccurred(e, new DownLoadState(Address, path, FileName, a, FromPosition, Length));
            }
            finally
            {
                if (hwrp != null)
                {
                    hwrp.Close();
                }
            }
        }

        /// <summary>
        /// Copies up to <paramref name="Length"/> bytes from the response
        /// into <paramref name="FileName"/>, starting at the offset given by
        /// the response's Content-Range header (0 when the header is absent).
        /// </summary>
        /// <remarks>
        /// On failure <see cref="ExceptionOccurrs"/> receives a state whose
        /// Position is the next file offset to write and whose Length is the
        /// number of bytes of this chunk still missing.
        /// </remarks>
        internal void ResponseAsBytes(string RequestURL, WebResponse Response, long Length, string FileName)
        {
            string a = null;           // attachment name
            int P = 0;                 // position within the whole file
            int p = 0;                 // position within this chunk
            int num1 = (int)Length;    // chunk length
            try
            {
                a = GetAttachmentName(Response, null);

                using (Stream S = Response.GetResponseStream())
                {
                    // "bytes 1000-1999/5000" -> 1000
                    string s = Response.Headers["Content-Range"];
                    if (s != null)
                    {
                        s = s.Replace("bytes ", "");
                        s = s.Substring(0, s.IndexOf("-"));
                        P = Convert.ToInt32(s);
                    }

                    byte[] buffer1 = new byte[CopyBufferSize];

                    // Shared read/write access so that several chunk threads
                    // can write to the same file at different offsets.
                    using (FileStream sw = new FileStream(FileName, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.ReadWrite))
                    {
                        sw.Position = P;
                        while (p < num1)
                        {
                            // Never read past the end of this chunk.
                            int num2 = S.Read(buffer1, 0, Math.Min(buffer1.Length, num1 - p));
                            if (num2 <= 0)
                            {
                                break;
                            }

                            sw.Write(buffer1, 0, num2);
                            p += num2;
                            P += num2;
                            // The counter is shared by all chunk threads.
                            Interlocked.Add(ref _getLength, num2);
                        }
                    }
                }
            }
            catch (Exception e)
            {
                string path = (Response != null) ? Response.ResponseUri.AbsolutePath : "";
                this.OnExceptionOccurred(e, new DownLoadState(RequestURL, path, FileName, a, P, num1 - p));
            }
        }

        /// <summary>
        /// Reads the whole response body into a byte array. When the content
        /// length is unknown the buffer grows in 64 KB steps and is trimmed
        /// to the number of bytes actually read.
        /// </summary>
        private byte[] ResponseAsBytes(WebResponse response)
        {
            long capacity = response.ContentLength;
            bool lengthUnknown = false;
            if (capacity == -1)
            {
                lengthUnknown = true;
                capacity = 0x10000;
            }

            byte[] buffer = new byte[(int)capacity];
            int total = 0;

            using (Stream stream = response.GetResponseStream())
            {
                int read;
                do
                {
                    read = stream.Read(buffer, total, ((int)capacity) - total);
                    total += read;
                    if (lengthUnknown && (total == capacity))
                    {
                        capacity += 0x10000;
                        byte[] grown = new byte[(int)capacity];
                        Buffer.BlockCopy(buffer, 0, grown, 0, total);
                        buffer = grown;
                    }
                }
                while (read != 0);
            }

            if (lengthUnknown)
            {
                byte[] trimmed = new byte[total];
                Buffer.BlockCopy(buffer, 0, trimmed, 0, total);
                buffer = trimmed;
            }

            return buffer;
        }

        /// <summary>Credentials attached to every request.</summary>
        public ICredentials Credentials
        {
            get { return this.m_credentials; }
            set { this.m_credentials = value; }
        }

        /// <summary>
        /// Name/value pairs used as the query string of every request.
        /// Created on first read.
        /// </summary>
        public NameValueCollection QueryString
        {
            get
            {
                if (this.m_requestParameters == null)
                {
                    this.m_requestParameters = new NameValueCollection();
                }
                return this.m_requestParameters;
            }
            set { this.m_requestParameters = value; }
        }

        /// <summary>
        /// Base URI that request addresses are resolved against. Null or
        /// empty clears it; reading returns an empty string when unset.
        /// </summary>
        /// <exception cref="ArgumentException">The value is not a valid URI.</exception>
        public string BaseAddress
        {
            get
            {
                if (this.m_baseAddress != null)
                {
                    return this.m_baseAddress.ToString();
                }
                return string.Empty;
            }
            set
            {
                if ((value == null) || (value.Length == 0))
                {
                    this.m_baseAddress = null;
                }
                else
                {
                    try
                    {
                        this.m_baseAddress = new Uri(value);
                    }
                    catch (Exception exception1)
                    {
                        throw new ArgumentException("value", exception1);
                    }
                }
            }
        }

        /// <summary>
        /// Builds the request URI: resolves <paramref name="path"/> against
        /// the base address when one is set and replaces the query with the
        /// configured parameters (written as-is, not URL-encoded). A path
        /// that is not a valid URI is treated as a local file path.
        /// </summary>
        private Uri GetUri(string path)
        {
            Uri uri1;
            try
            {
                if (this.m_baseAddress != null)
                {
                    uri1 = new Uri(this.m_baseAddress, path);
                }
                else
                {
                    uri1 = new Uri(path);
                }

                if (this.m_requestParameters == null)
                {
                    return uri1;
                }

                string[] keys = this.m_requestParameters.AllKeys;
                StringBuilder query = new StringBuilder();
                for (int i = 0; i < keys.Length; i++)
                {
                    if (i > 0)
                    {
                        query.Append("&");
                    }
                    query.Append(keys[i]).Append("=").Append(this.m_requestParameters[i]);
                }

                UriBuilder builder = new UriBuilder(uri1);
                builder.Query = query.ToString();
                uri1 = builder.Uri;
            }
            catch (UriFormatException)
            {
                uri1 = new Uri(Path.GetFullPath(path));
            }
            return uri1;
        }

        /// <summary>
        /// Returns the text after "filename=" in the Content-Disposition
        /// header, or <paramref name="fallback"/> when the header or the
        /// "filename=" part is missing.
        /// </summary>
        private static string GetAttachmentName(WebResponse response, string fallback)
        {
            string disposition = response.Headers["Content-Disposition"];
            if (disposition == null)
            {
                return fallback;
            }

            int index = disposition.LastIndexOf("filename=");
            if (index < 0)
            {
                return fallback;
            }
            return disposition.Substring(index + 9);
        }

        /// <summary>Raises <see cref="ExceptionOccurrs"/> if anyone is subscribed.</summary>
        private void OnExceptionOccurred(Exception error, DownLoadState state)
        {
            ExceptionEventHandler handler = this.ExceptionOccurrs;
            if (handler != null)
            {
                handler(this, new ExceptionEventArgs(error, state));
            }
        }
    }
}
| ||
// A simple demo showing how to use the downloader.
namespace MultiThread
{
    using System;
    using System.IO;
    using System.Net;
    using System.Text;
    using System.Security;
    using System.Threading;
    using System.Collections;
    using System.Collections.Specialized;
    using Mp3Crazy;

    /// <summary>
    /// Test driver: downloads the same URL three times, each to its own file.
    /// </summary>
    class AppTest
    {
        int _k = 0;
        int _K = 0;
        string _F; // URL being downloaded
        string _f; // local file name without directory

        static void Main()
        {
            go();

            // Keep the process alive while the worker threads finish.
            System.Console.ReadLine();
        }

        /// <summary>Starts three downloads of the same file.</summary>
        private static void go()
        {
            for (int i = 0; i < 3; i++)
            {
                AppTest a = new AppTest();
                HttpWebClient x = new HttpWebClient();

                x.TBlocks = 2;
                x.curBlock = 0;
                a._K = 2;
                x.TimeOut = 10000;

                // Subscribe to the error and chunk-finished events.
                x.ExceptionOccurrs += new HttpWebClient.ExceptionEventHandler(a.x_ExceptionOccurrs);
                x.ThreadProcessEnd += new HttpWebClient.ThreadProcessEventHandler(a.x_ThreadProcessEnd);

                string F = "http://localhost/gsx.MP3";
                a._F = F;
                string f = F.Substring(F.LastIndexOf("/") + 1) + i.ToString();
                a._f = f;

                x.DownloadFile(F, @"E:\" + f, a._K);
            }
        }

        /// <summary>
        /// Prints the error and downloads the failed range again, which
        /// amounts to resuming the download. Other strategies are possible.
        /// </summary>
        private void x_ExceptionOccurrs(HttpWebClient Sender, ExceptionEventArgs e)
        {
            System.Console.WriteLine(e.Exception.Message);

            // Retry into the file the failed download was writing to. The
            // bare name in _f has no directory and would create a different
            // file in the current directory.
            HttpWebClient x = new HttpWebClient();
            x.DownloadFileChunk(this._F, e.DownloadState.FileName, e.DownloadState.Position, e.DownloadState.Length);
            e.ExceptionAction = ExceptionActions.Ignore;
        }

        /// <summary>Prints "end" once the expected number of chunks has finished.</summary>
        private void x_ThreadProcessEnd(HttpWebClient Sender, ThreadProcessEventArgs e)
        {
            // The event is raised from several worker threads, so the
            // counter is incremented atomically.
            if (Interlocked.Increment(ref Sender.curBlock) == Sender.TBlocks)
            {
                System.Console.WriteLine("end\n");
            }
        }
    }

    /// <summary>
    /// Stand-alone sample: requests bytes 50-100 of a page and prints the
    /// request headers and the returned content.
    /// </summary>
    class Test
    {
        public static void Main2()
        {
            // Create the request and ask for a byte range.
            HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create("http://localhost/default.aspx");
            myHttpWebRequest.AddRange(50, 100);

            // The using blocks release the reader and the connection even
            // when reading fails.
            using (HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse())
            {
                Console.WriteLine("\nThe HttpHeaders are \n\n\tName\t\tValue\n{0}", myHttpWebRequest.Headers);

                IEnumerator ie = myHttpWebRequest.Headers.GetEnumerator();
                while (ie.MoveNext())
                {
                    Console.WriteLine("key:{0}", ie.Current.ToString());
                }

                // Print the contents of the page to the console.
                using (Stream streamResponse = myHttpWebResponse.GetResponseStream())
                using (StreamReader streamRead = new StreamReader(streamResponse))
                {
                    Char[] readBuff = new Char[256];
                    int count = streamRead.Read(readBuff, 0, 256);
                    Console.WriteLine("\nThe HTML contents of page the are : \n\n ");
                    while (count > 0)
                    {
                        String outputData = new String(readBuff, 0, count);
                        Console.Write(outputData);
                        count = streamRead.Read(readBuff, 0, 256);
                    }
                }
            }

            Console.ReadLine();
        }
    }
}