[爬虫]抓取知乎百万用户信息之自建代理池
点击我前往Github查看源代码 别忘记star
本项目github地址:https://github.com/wangqifan/ZhiHu
如果你觉得服务商的服务太贵,可以考虑自建一个代理池。云代理推荐阿布云:https://www.abuyun.com/
应用场景
爬虫过于频繁地抓取网站信息会被反爬虫机制屏蔽,或者有些网站对IP有限制,一个IP只能操作一次,这个时候就需要设置代理了。这方面的需求还是很大的,有专门的服务商提供代理;没钱的话,就自己动手打造一个代理池吧。
所用的工具
Redis的C#驱动-ServiceStack.Redis
Html解析-HtmlAgilityPack 任务调度-Quartz.NET
基本原理
部分网站上有免费的代理IP信息,比如xicidaili.com,proxy360.cn。这些网站有很多免费代理IP,然而有些质量不好,需要程序及时从代理池中删掉质量低的代理,不断加入优质代理。
思路来自知乎-https://www.zhihu.com/question/25566731
原理示意图
接下来代码实现
创建一个名为ProxyPool的控制台应用程序,并使用NuGet添加ServiceStack.Redis、HtmlAgilityPack、Quartz.NET包
创建一个Proxy类
/// <summary>
/// A single proxy endpoint: a host address together with a port number.
/// </summary>
public class Proxy
{
    /// <summary>
    /// Host name or IP address of the proxy. Carries the <c>[Key]</c> attribute.
    /// </summary>
    [Key]
    public string Adress { get; set; }

    /// <summary>
    /// Port number used together with <see cref="Adress"/> to reach the proxy.
    /// </summary>
    public int port { get; set; }
}
封装一个资源获取方法
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body decoded as UTF-8 text.
/// </summary>
/// <param name="url">Absolute URL to request.</param>
/// <returns>
/// The response body, decompressed first when the response's Content-Encoding mentions
/// "gzip" or "deflate". An empty string is returned when the request fails for any
/// reason (malformed URL, network error, decoding error); the failure is logged to
/// standard error instead of being thrown.
/// </returns>
public string DownloadHtml(string url)
{
    string source = string.Empty;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0";

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        {
            // Ordinal, case-insensitive matching: ToLower() is culture-sensitive and
            // can fail to match "GZIP" under cultures such as tr-TR.
            string contentEncoding = response.ContentEncoding ?? string.Empty;

            // Wrap the one response stream instead of asking the response for it again.
            Stream decoded = body;
            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                decoded = new GZipStream(body, CompressionMode.Decompress);
            }
            else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                decoded = new DeflateStream(body, CompressionMode.Decompress);
            }

            // Disposing the reader also disposes the decompression wrapper (if any).
            using (StreamReader reader = new StreamReader(decoded, Encoding.UTF8))
            {
                source = reader.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort by design: callers treat an empty string as "nothing downloaded".
        Console.Error.WriteLine("DownloadHtml failed for '" + url + "': " + ex.Message);
    }
    return source;
}
检测代理是否有效
/// <summary>
/// Checks whether <paramref name="proxy"/> can be used to fetch https://www.baidu.com/.
/// </summary>
/// <param name="proxy">Proxy whose <c>Adress</c> and <c>port</c> are used for the request.</param>
/// <returns>
/// <c>true</c> when the page fetched through the proxy contains the text "百度";
/// <c>false</c> when it does not, or when the request fails in any way
/// (including a 1000 ms request timeout or a null <paramref name="proxy"/>).
/// </returns>
public static bool IsAvailable(Proxy proxy)
{
    if (proxy == null)
    {
        return false;
    }

    bool result = false;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create("https://www.baidu.com/");
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0";
        request.Proxy = new WebProxy(proxy.Adress, proxy.port);
        request.Timeout = 1000;

        // The response must be disposed, otherwise its underlying connection is
        // never released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream dataStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(dataStream, Encoding.UTF8))
        {
            result = reader.ReadToEnd().Contains("百度");
        }
    }
    catch (Exception)
    {
        // Intentional: any failure (timeout, refused connection, bad address, ...)
        // simply means the proxy is not usable.
        result = false;
    }
    return result;
}
将代理添加到Redis的Set集合
/// <summary>
/// Adds <paramref name="proxy"/> to the "ProxyPool" Redis set as "address:port",
/// but only if <c>IsAvailable</c> reports it as working.
/// </summary>
/// <param name="proxy">Candidate proxy to verify and store.</param>
public void Add(Proxy proxy)
{
    // Probe first so no Redis client is held open during the (slow) network check.
    if (!IsAvailable(proxy))
    {
        return;
    }

    Console.WriteLine(proxy.Adress);

    // NOTE(review): this writes to 127.0.0.1 while Pool.GetProxy/PushProxy use
    // 59.74.169.57 — confirm both are meant to be the same Redis instance.
    using (RedisClient client = new RedisClient("127.0.0.1", 6379))
    {
        client.AddItemToSet("ProxyPool", proxy.Adress + ":" + proxy.port.ToString());
    }
}
下载西刺代理
/// <summary>
/// Downloads the xicidaili listing pages, extracts address/port pairs from the
/// table rows with class "odd", and passes each one to <c>Add</c>.
/// </summary>
/// <param name="DATA">Unused; present so the method matches the thread-pool callback signature.</param>
public void Downloadxicidaili(object DATA)
{
    string[] listingUrls =
    {
        "http://www.xicidaili.com/nt/",
        "http://www.xicidaili.com/nn/",
        "http://www.xicidaili.com/wn/",
        "http://www.xicidaili.com/wt/"
    };

    foreach (string url in listingUrls)
    {
        // Failures are isolated per URL so one bad page does not abort the others.
        try
        {
            string html = DownloadHtml(url);
            if (string.IsNullOrEmpty(html))
            {
                continue;
            }

            HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//tr[@class='odd']");
            if (rows == null)
            {
                continue;
            }

            foreach (HtmlNode row in rows)
            {
                HtmlNode addressCell = row.SelectSingleNode("td[2]");
                HtmlNode portCell = row.SelectSingleNode("td[3]");
                if (addressCell == null || portCell == null)
                {
                    continue;
                }

                int port;
                if (!int.TryParse(portCell.InnerHtml.Trim(), out port))
                {
                    // Skip a malformed row instead of abandoning the whole page.
                    continue;
                }

                Proxy proxy = new Proxy();
                proxy.Adress = addressCell.InnerHtml.Trim();
                proxy.port = port;
                Console.WriteLine(proxy.Adress);
                Add(proxy);
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine("Downloadxicidaili failed for '" + url + "': " + ex.Message);
        }
    }

    Console.WriteLine("西刺");
}
快代理
/// <summary>
/// Downloads listing pages 1 to 3 of the kuaidaili proxy list, extracts address/port
/// pairs from the table body rows, and passes each one to <c>Add</c>.
/// </summary>
/// <param name="DATA">Unused; present so the method matches the thread-pool callback signature.</param>
public void Downkuaidaili(object DATA)
{
    // NOTE(review): the original requested "http://www.xicidaili.com/nt/" here, a
    // copy-paste from Downloadxicidaili that does not match this method's purpose or
    // its "//tbody/tr" row layout. Confirm this kuaidaili listing URL is still current.
    string url = "http://www.kuaidaili.com/proxylist/";

    for (int i = 1; i < 4; i++)
    {
        string pageUrl = url + i.ToString();

        // Failures are isolated per page so one bad page does not abort the others.
        try
        {
            string html = DownloadHtml(pageUrl);
            if (string.IsNullOrEmpty(html))
            {
                continue;
            }

            HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//tbody/tr");
            if (rows == null)
            {
                continue;
            }

            foreach (HtmlNode row in rows)
            {
                // "td[1]" rather than FirstChild: the first child of a row can be a
                // whitespace text node instead of the address cell.
                HtmlNode addressCell = row.SelectSingleNode("td[1]");
                HtmlNode portCell = row.SelectSingleNode("td[2]");
                if (addressCell == null || portCell == null)
                {
                    continue;
                }

                int port;
                if (!int.TryParse(portCell.InnerHtml.Trim(), out port))
                {
                    // Skip a malformed row instead of abandoning the whole page.
                    continue;
                }

                Proxy proxy = new Proxy();
                proxy.Adress = addressCell.InnerHtml.Trim();
                proxy.port = port;
                Console.WriteLine(proxy.Adress);
                Add(proxy);
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine("Downkuaidaili failed for '" + pageUrl + "': " + ex.Message);
        }
    }
}
Proxy360
/// <summary>
/// Downloads the proxy360 listing page, extracts address/port pairs from each
/// <c>div.proxylistitem</c> entry, and passes each one to <c>Add</c>.
/// </summary>
/// <param name="DATA">Unused; present so the method matches the thread-pool callback signature.</param>
public void Downloadproxy360(object DATA)
{
    string url = "http://www.proxy360.cn/default.aspx";
    try
    {
        string html = DownloadHtml(url);
        if (!string.IsNullOrEmpty(html))
        {
            HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(html);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection items = doc.DocumentNode.SelectNodes("//div[@class='proxylistitem']");
            if (items != null)
            {
                foreach (HtmlNode item in items)
                {
                    // The address/port spans live under the item's second child node.
                    if (item.ChildNodes.Count < 2)
                    {
                        continue;
                    }

                    HtmlNode details = item.ChildNodes[1];
                    HtmlNode addressSpan = details.SelectSingleNode("span[1]");
                    HtmlNode portSpan = details.SelectSingleNode("span[2]");
                    if (addressSpan == null || portSpan == null)
                    {
                        continue;
                    }

                    int port;
                    if (!int.TryParse(portSpan.InnerHtml.Trim(), out port))
                    {
                        // Skip a malformed entry instead of abandoning the whole page.
                        continue;
                    }

                    Proxy proxy = new Proxy();
                    proxy.Adress = addressSpan.InnerHtml.Trim();
                    proxy.port = port;
                    Console.WriteLine(proxy.Adress);
                    Add(proxy);
                }
            }
        }
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine("Downloadproxy360 failed for '" + url + "': " + ex.Message);
    }

    Console.WriteLine("proxy360");
}
多线程爬取
/// <summary>
/// Queues the three proxy-site downloaders on the thread pool, in the order
/// xicidaili, kuaidaili, proxy360.
/// </summary>
public void Initial()
{
    WaitCallback[] downloaders =
    {
        Downloadxicidaili,
        Downkuaidaili,
        Downloadproxy360
    };

    foreach (WaitCallback downloader in downloaders)
    {
        ThreadPool.QueueUserWorkItem(downloader);
    }
}
删除接口和随机获取接口
/// <summary>
/// Static accessors for the "ProxyPool" Redis set: fetch a random entry or remove one.
/// </summary>
public class Pool
{
    // NOTE(review): hard-coded server address; consider moving it to configuration.
    private const string RedisHost = "59.74.169.57";
    private const int RedisPort = 6379;
    private const string PoolKey = "ProxyPool";

    /// <summary>
    /// Returns a random entry from the proxy set.
    /// </summary>
    /// <returns>
    /// The entry as stored in the set, or an empty string when Redis could not be
    /// queried (the failure is logged to standard error). The value comes straight
    /// from the Redis client, so callers should also handle null — presumably
    /// returned when the set is empty; verify against ServiceStack.Redis.
    /// </returns>
    public static string GetProxy()
    {
        string result = string.Empty;
        try
        {
            using (RedisClient client = new RedisClient(RedisHost, RedisPort))
            {
                result = client.GetRandomItemFromSet(PoolKey);
            }
        }
        catch (Exception ex)
        {
            // Still best-effort, but no longer silent.
            Console.Error.WriteLine("Pool.GetProxy failed: " + ex.Message);
        }
        return result;
    }

    /// <summary>
    /// Removes <paramref name="value"/> from the proxy set. Despite its name, this
    /// method deletes the entry; it does not add one.
    /// </summary>
    /// <param name="value">The set entry to remove.</param>
    public static void PushProxy(string value)
    {
        try
        {
            using (RedisClient client = new RedisClient(RedisHost, RedisPort))
            {
                client.RemoveItemFromSet(PoolKey, value);
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine("删除代理失败!");
            Console.Error.WriteLine("Pool.PushProxy failed: " + ex.Message);
        }
    }
}
不断检测代理池
/// <summary>
/// Runs forever: every 500 ms picks a random entry from the pool, re-checks it
/// with <c>IsAvailable</c>, and removes it from the pool when it no longer works.
/// Entries that are not of the form "address:port" are removed as well.
/// </summary>
public void TestAll()
{
    while (true)
    {
        string entry = Pool.GetProxy();
        if (!string.IsNullOrEmpty(entry))
        {
            int separator = entry.IndexOf(':');
            int port = 0;

            // Validate before slicing: a missing ':' or a non-numeric port used to
            // throw out of this loop and stop the checker for good.
            bool wellFormed = separator > 0
                && int.TryParse(entry.Substring(separator + 1), out port);

            if (!wellFormed)
            {
                // A malformed entry can never be used as a proxy, so drop it.
                Pool.PushProxy(entry);
            }
            else
            {
                Proxy candidate = new Proxy();
                candidate.Adress = entry.Substring(0, separator);
                candidate.port = port;

                if (!IsAvailable(candidate))
                {
                    // PushProxy removes the entry from the pool.
                    Pool.PushProxy(entry);
                }
            }
        }

        Thread.Sleep(500);
    }
}
job类
/// <summary>
/// Scheduler job that starts one round of proxy collection.
/// </summary>
class TotalJob : IJob
{
    /// <summary>
    /// Creates a <c>PoolManage</c> and calls its <c>Initial</c> method.
    /// </summary>
    /// <param name="context">Execution context supplied by the scheduler; not used.</param>
    public void Execute(IJobExecutionContext context)
    {
        new PoolManage().Initial();
    }
}
任务声明
// Scheduler started by Run(); kept so Main can shut it down on exit.
private static IScheduler _scheduler;

/// <summary>
/// Entry point: starts the scheduled collection job, waits for a key press,
/// then shuts the scheduler down.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    Run();
    Console.WriteLine("Press any key to close the application");

    // Previously Main returned right after printing the prompt, without waiting
    // for a key or stopping the scheduler.
    Console.ReadKey(true);

    if (_scheduler != null)
    {
        try
        {
            _scheduler.Shutdown();
        }
        catch (SchedulerException se)
        {
            Console.WriteLine(se);
        }
    }
}

/// <summary>
/// Creates and starts the scheduler and schedules <c>TotalJob</c> to run
/// immediately and then once per minute, forever. Scheduler errors are written
/// to the console.
/// </summary>
private static void Run()
{
    try
    {
        StdSchedulerFactory factory = new StdSchedulerFactory();
        IScheduler scheduler = factory.GetScheduler();

        // Remember the scheduler straight away so it can be shut down even if
        // scheduling the job fails below.
        _scheduler = scheduler;
        scheduler.Start();

        IJobDetail job = JobBuilder.Create<TotalJob>()
            .WithIdentity("job1", "group1")
            .Build();

        ITrigger trigger = TriggerBuilder.Create()
            .WithIdentity("trigger1", "group1")
            .StartNow()
            .WithSimpleSchedule(x => x
                .WithIntervalInMinutes(1)
                .RepeatForever())
            .Build();

        scheduler.ScheduleJob(job, trigger);
    }
    catch (SchedulerException se)
    {
        Console.WriteLine(se);
    }
}
Github地址:https://github.com/wangqifan/ProxyPool