// Writing your own scraper / spider in C# (自己用C#写一个采集器、蜘蛛)
using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.Web;
using System.IO;
using System.Collections;
using System.Text.RegularExpressions;

namespace chinaz
{
    /// <summary>
    /// Console entry point. Downloads a numbered range of pages and appends every
    /// distinct e-mail address found in them to <c>mail.txt</c>.
    /// </summary>
    class Program
    {
        // Built once and reused for every page instead of being re-resolved per call.
        private static readonly Regex EmailPattern = new Regex(
            @"\w+([-+.']\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*",
            RegexOptions.Compiled | RegexOptions.IgnoreCase);

        /// <summary>
        /// Reads the raw Cookie header from <c>cookie.txt</c>, then reads four lines from
        /// standard input: first page number, last page number, a URL format string
        /// (<c>{0}</c> is replaced by the page number) and the page encoding name.
        /// </summary>
        static void Main(string[] args)
        {
            string cookie;
            using (StreamReader sr = new StreamReader("cookie.txt"))
            {
                // Text editors commonly leave a trailing newline; it is not part of the header value.
                cookie = sr.ReadToEnd().Trim();
            }

            int a = int.Parse(Console.ReadLine());
            int b = int.Parse(Console.ReadLine());
            string url = Console.ReadLine();
            Encoding encoding = Encoding.GetEncoding(Console.ReadLine());

            // Addresses already written, so each one is reported only once per run.
            Dictionary<string, bool> seen = new Dictionary<string, bool>();

            for (int i = a; i <= b; i++)
            {
                string html = SRWebClient.GetPage(string.Format(url, i), encoding, cookie);

                // Very short bodies are treated as failed or empty pages and skipped.
                if (html == null || html.Length <= 1000)
                {
                    continue;
                }

                for (Match m = EmailPattern.Match(html); m.Success; m = m.NextMatch())
                {
                    if (seen.ContainsKey(m.Value))
                    {
                        continue;
                    }

                    seen.Add(m.Value, true);
                    Console.WriteLine(m.Value);

                    // Appended immediately so results survive if a later page aborts the run.
                    File.AppendAllText("mail.txt", m.Value + Environment.NewLine);
                }
            }

            Console.Write("完成");
            Console.ReadLine();
        }
    }

    /// <summary>
    /// Small HTTP helper built on <see cref="HttpWebRequest"/>: page download, file
    /// download and form submission, with an optional cookie collection carried
    /// between calls on the same instance.
    /// </summary>
    public class SRWebClient
    {
        private const string BrowserUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; Maxthon; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
        private const string PageUserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2;)";
        private const string RingDownloadUrl = "http://www.imobile.com.cn/wapdiyringdownload.php";
        private const string RingSaveRoot = "f:/mobileMusic/";
        private const int TimeoutMilliseconds = 20000;
        private const int CopyBufferSize = 8192;
        private const long MaxPageBytes = 1024 * 1024;

        /// <summary>
        /// Cookies sent with every instance-method request; <c>null</c> until a call made
        /// with <c>CreateCookie = true</c> stores some (or the caller assigns it).
        /// </summary>
        public CookieCollection cookie;

        public SRWebClient()
        {
            cookie = null;
        }

        /// <summary>
        /// Reports a swallowed failure through <see cref="System.Diagnostics.Trace"/> so that
        /// best-effort methods can keep returning their "empty" value without hiding the cause.
        /// </summary>
        private static void LogError(string operation, Exception ex)
        {
            System.Diagnostics.Trace.TraceError("SRWebClient.{0} failed: {1}", operation, ex);
        }

        #region Read a string containing several cookies into a CookieCollection

        /// <summary>
        /// Splits <paramref name="cookieHead"/> on ';' and adds every segment that yields a
        /// cookie to <paramref name="cookieCol"/> (created when <c>null</c>).
        /// </summary>
        /// <param name="cookieCol">Target collection; allocated here if it is <c>null</c>.</param>
        /// <param name="cookieHead">Raw Set-Cookie header value; <c>null</c> adds nothing.</param>
        /// <param name="defaultDomain">Domain given to cookies that do not carry their own.</param>
        private static void AddCookieWithCookieHead(ref CookieCollection cookieCol, string cookieHead, string defaultDomain)
        {
            if (cookieCol == null)
            {
                cookieCol = new CookieCollection();
            }

            if (cookieHead == null)
            {
                return;
            }

            foreach (string segment in cookieHead.Split(';'))
            {
                Cookie ck;
                try
                {
                    ck = GetCookieFromString(segment.Trim(), defaultDomain);
                }
                catch (CookieException ex)
                {
                    // One malformed cookie must not discard the rest of the response.
                    LogError("AddCookieWithCookieHead", ex);
                    continue;
                }

                if (ck != null)
                {
                    cookieCol.Add(ck);
                }
            }
        }

        #endregion

        #region Read one cookie string into a Cookie

        /// <summary>
        /// Builds a cookie from a comma-separated list of <c>key=value</c> pairs.
        /// <c>path</c> and <c>domain</c> set the matching cookie properties, <c>expires</c>,
        /// <c>max-age</c> and <c>samesite</c> are ignored, and any other key becomes the cookie
        /// name. Attribute names are matched case-insensitively; pairs are applied in order,
        /// so the last name/value pair wins.
        /// </summary>
        /// <returns>The cookie, or <c>null</c> when the string contains no cookie name.</returns>
        private static Cookie GetCookieFromString(string cookieString, string defaultDomain)
        {
            Cookie ck = new Cookie();

            foreach (string rawPair in cookieString.Split(','))
            {
                string pair = rawPair.Trim();
                int index = pair.IndexOf('=');
                if (index <= 0)
                {
                    // No key (or no '=' at all): flags such as "HttpOnly" or a date fragment.
                    continue;
                }

                string key = pair.Substring(0, index).Trim();
                string value = pair.Substring(index + 1);

                if (string.Equals(key, "path", StringComparison.OrdinalIgnoreCase))
                {
                    ck.Path = value;
                }
                else if (string.Equals(key, "domain", StringComparison.OrdinalIgnoreCase))
                {
                    ck.Domain = value;
                }
                else if (string.Equals(key, "expires", StringComparison.OrdinalIgnoreCase)
                    || string.Equals(key, "max-age", StringComparison.OrdinalIgnoreCase)
                    || string.Equals(key, "samesite", StringComparison.OrdinalIgnoreCase))
                {
                    // Lifetime and policy attributes are not tracked by this client.
                    continue;
                }
                else
                {
                    ck.Name = key;
                    ck.Value = value;
                }
            }

            if (ck.Name == "")
            {
                return null;
            }

            if (ck.Domain == "")
            {
                ck.Domain = defaultDomain;
            }

            return ck;
        }

        #endregion

        /// <summary>
        /// Creates a request for <paramref name="url"/> carrying the stored cookies.
        /// A cookie container is always attached because the response's
        /// <see cref="HttpWebResponse.Cookies"/> stays empty without one.
        /// </summary>
        private HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.CookieContainer = new CookieContainer();
            if (cookie != null)
            {
                request.CookieContainer.Add(cookie);
            }

            return request;
        }

        /// <summary>
        /// Creates the browser-like request shared by <see cref="Request"/> and
        /// <see cref="PostDownload"/>. When <paramref name="data"/> yields a non-empty body the
        /// request becomes a form POST and the body is sent as UTF-8; otherwise it stays a GET.
        /// </summary>
        private HttpWebRequest CreateFormRequest(string url, RequestData data)
        {
            string postdata = data.GetData();

            HttpWebRequest request = CreateRequest(url);
            request.Referer = url;
            request.AllowAutoRedirect = false;
            request.UserAgent = BrowserUserAgent;
            request.Timeout = TimeoutMilliseconds;

            if (postdata.Length > 0)
            {
                byte[] payload = Encoding.UTF8.GetBytes(postdata);
                request.ContentType = "application/x-www-form-urlencoded";
                request.Method = "POST";
                request.ContentLength = payload.Length;
                using (Stream body = request.GetRequestStream())
                {
                    body.Write(payload, 0, payload.Length);
                }
            }

            return request;
        }

        /// <summary>
        /// Copies <paramref name="source"/> to a newly created file at <paramref name="path"/>.
        /// The first chunk is read before the file is created, so a stream that fails
        /// immediately leaves no file behind.
        /// </summary>
        private static void CopyToFile(Stream source, string path)
        {
            byte[] buffer = new byte[CopyBufferSize];
            int read = source.Read(buffer, 0, buffer.Length);

            using (FileStream target = File.Create(path))
            {
                while (read > 0)
                {
                    target.Write(buffer, 0, read);
                    read = source.Read(buffer, 0, buffer.Length);
                }
            }
        }

        /// <summary>
        /// Extracts the bare file name from a Content-Disposition header value: surrounding
        /// quotes, trailing parameters and any directory components are removed.
        /// </summary>
        /// <returns>The file name, or <c>null</c> when the header carries none.</returns>
        private static string GetDispositionFileName(string disposition)
        {
            const string marker = "filename=";

            if (disposition == null)
            {
                return null;
            }

            int start = disposition.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
            if (start < 0)
            {
                return null;
            }

            string name = disposition.Substring(start + marker.Length).TrimStart();
            if (name.Length > 0 && name[0] == '"')
            {
                int close = name.IndexOf('"', 1);
                name = close > 0 ? name.Substring(1, close - 1) : name.Substring(1);
            }
            else
            {
                int end = name.IndexOf(';');
                if (end >= 0)
                {
                    name = name.Substring(0, end);
                }
            }

            // The name comes from the server: never let it choose the target directory.
            name = Path.GetFileName(name.Trim());
            return name.Length == 0 ? null : name;
        }

        /// <summary>
        /// Downloads the body of a page, decoded with the system default encoding.
        /// Redirects are not followed and the timeout is 20 seconds.
        /// </summary>
        /// <param name="URL">Address of the page.</param>
        /// <param name="CreateCookie">When true, <see cref="cookie"/> is replaced by the cookies of this response.</param>
        /// <returns>The page text, or an empty string when anything fails.</returns>
        public string DownloadHtml(string URL, bool CreateCookie)
        {
            try
            {
                HttpWebRequest request = CreateRequest(URL);
                request.AllowAutoRedirect = false;
                request.Timeout = TimeoutMilliseconds;

                using (HttpWebResponse res = (HttpWebResponse)request.GetResponse())
                using (StreamReader reader = new StreamReader(res.GetResponseStream(), Encoding.Default))
                {
                    string html = reader.ReadToEnd();
                    if (CreateCookie)
                    {
                        cookie = res.Cookies;
                    }

                    return html;
                }
            }
            catch (Exception ex)
            {
                LogError("DownloadHtml", ex);
                return string.Empty;
            }
        }

        /// <summary>
        /// Downloads a file to disk, following redirects.
        /// </summary>
        /// <param name="FileURL">Address of the file.</param>
        /// <param name="FileSavePath">Path of the file to create (overwritten if present).</param>
        /// <param name="CreateCookie">When true, <see cref="cookie"/> is replaced by the cookies of this response.</param>
        /// <returns>
        /// The Content-Length reported by the response (-1 when the server does not send one).
        /// Failures while copying the body are logged and do not change the return value;
        /// failures while connecting are thrown to the caller.
        /// </returns>
        public long DownloadFile(string FileURL, string FileSavePath, bool CreateCookie)
        {
            HttpWebRequest req = CreateRequest(FileURL);
            req.AllowAutoRedirect = true;

            using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
            {
                if (CreateCookie)
                {
                    cookie = res.Cookies;
                }

                long fileLength = res.ContentLength;
                using (Stream stream = res.GetResponseStream())
                {
                    try
                    {
                        CopyToFile(stream, FileSavePath);
                    }
                    catch (Exception ex)
                    {
                        LogError("DownloadFile", ex);
                    }
                }

                return fileLength;
            }
        }

        /// <summary>
        /// Submits form data to a page and returns the response body, decoded with the system
        /// default encoding. A POST is used when <paramref name="Data"/> contains fields,
        /// otherwise a GET. Redirects are not followed.
        /// </summary>
        /// <param name="RequestPageURL">Address to submit to; also sent as the Referer.</param>
        /// <param name="Data">Form fields to send.</param>
        /// <param name="CreateCookie">When true, cookies from the Set-Cookie header are added to <see cref="cookie"/>.</param>
        /// <returns>The response text, or an empty string when anything fails.</returns>
        public string Request(string RequestPageURL, RequestData Data, bool CreateCookie)
        {
            try
            {
                HttpWebRequest request = CreateFormRequest(RequestPageURL, Data);

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (CreateCookie)
                    {
                        AddCookieWithCookieHead(ref cookie, response.Headers["Set-Cookie"], request.RequestUri.Host);
                    }

                    using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.Default))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
            catch (Exception ex)
            {
                LogError("Request", ex);
                return string.Empty;
            }
        }

        /// <summary>
        /// Posts <paramref name="Data"/> to the fixed ring-tone download address and saves the
        /// returned file under <c>f:/mobileMusic/&lt;0-99&gt;/&lt;file name&gt;</c>, where the
        /// file name is taken from the Content-Disposition header.
        /// </summary>
        /// <param name="Data">Form fields to send.</param>
        /// <param name="file">
        /// Path of the saved file relative to the save root (<c>"&lt;0-99&gt;/&lt;file name&gt;"</c>);
        /// <c>null</c> when no file name could be determined. It may be set even when the
        /// method returns false because the failure happened while saving.
        /// </param>
        /// <returns>True only when the file was written completely; false on any failure.</returns>
        public bool PostDownload(RequestData Data, out string file)
        {
            file = null;

            try
            {
                HttpWebRequest request = CreateFormRequest(RingDownloadUrl, Data);

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    string fileName = GetDispositionFileName(response.Headers["Content-Disposition"]);
                    if (fileName == null)
                    {
                        return false;
                    }

                    // Files are spread over 100 numbered sub-directories.
                    file = new Random().Next(100).ToString() + "/" + fileName;

                    string fullPath = RingSaveRoot + file;
                    Directory.CreateDirectory(Path.GetDirectoryName(fullPath));

                    using (Stream stream = response.GetResponseStream())
                    {
                        CopyToFile(stream, fullPath);
                    }

                    return true;
                }
            }
            catch (Exception ex)
            {
                LogError("PostDownload", ex);
                return false;
            }
        }

        #region GetPage

        /// <summary>
        /// Downloads a page without cookies, trying up to <paramref name="TryCount"/> times
        /// until a non-empty body is returned.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="encoding">Encoding used to decode the body.</param>
        /// <param name="TryCount">Maximum number of attempts; zero or less makes no request.</param>
        /// <returns>The page text, or an empty string when every attempt fails.</returns>
        public static string GetPage(string url, Encoding encoding, int TryCount)
        {
            for (int attempt = 0; attempt < TryCount; attempt++)
            {
                string result = GetPage(url, encoding, (string)null);
                if (!string.IsNullOrEmpty(result))
                {
                    return result;
                }
            }

            return string.Empty;
        }

        /// <summary>
        /// Downloads a page. Redirects are not followed, the timeout is 20 seconds, and the
        /// body is only read when the status is 200 and the reported length is under 1 MiB.
        /// </summary>
        /// <param name="url">Address of the page.</param>
        /// <param name="encoding">Encoding used to decode the body.</param>
        /// <param name="cookie">Raw Cookie header value; <c>null</c> or empty sends none.</param>
        /// <returns>The page text, or an empty string when the page is rejected or anything fails.</returns>
        public static string GetPage(string url, Encoding encoding, string cookie)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.UserAgent = PageUserAgent;
                request.Timeout = TimeoutMilliseconds;
                request.AllowAutoRedirect = false;
                if (!string.IsNullOrEmpty(cookie))
                {
                    request.Headers["Cookie"] = cookie;
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= MaxPageBytes)
                    {
                        return string.Empty;
                    }

                    using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoding))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
            catch (Exception ex)
            {
                LogError("GetPage", ex);
                return string.Empty;
            }
        }

        #endregion
    }

    /// <summary>
    /// Collects form fields and renders them as a <c>key=value&amp;key=value</c> string.
    /// </summary>
    public class RequestData
    {
        private readonly Hashtable hash = new Hashtable();

        public RequestData()
        {
        }

        /// <summary>
        /// Joins all fields as <c>key=value</c> pairs separated by '&amp;'; returns an empty
        /// string when no field was added.
        /// NOTE(review): keys and values are written as-is, without URL-encoding, so callers
        /// must pre-encode anything containing '&amp;', '=' or non-ASCII text.
        /// </summary>
        public string GetData()
        {
            StringBuilder data = new StringBuilder();

            foreach (string key in hash.Keys)
            {
                if (data.Length > 0)
                {
                    data.Append('&');
                }

                data.Append(key).Append('=').Append(hash[key]);
            }

            return data.ToString();
        }

        /// <summary>
        /// Sets a field, replacing any value previously stored under the same name.
        /// </summary>
        public void AddField(string Field, string Value)
        {
            hash[Field] = Value;
        }
    }
}