Fork me on GitHub

[爬虫]抓取知乎百万用户信息之自建代理池

             点击我前往Github查看源代码   别忘记star

本项目github地址:https://github.com/wangqifan/ZhiHu     

如果你觉得服务商的服务太贵,可以考虑自建一个代理池。云代理推荐阿布云:https://www.abuyun.com/

应用场景

   爬虫过于频繁地抓取网站信息会被反爬虫机制屏蔽掉,或者有些网站对我们的IP有限制,一个IP只能操作一次,这个时候就需要设置代理了。这方面需求还是很大的,有专门的服务商提供代理,没钱的就自己动手打造一个代理池吧。

 

所用的工具

 

  Redis的C#驱动-ServiceStack.Redis

 

  Html解析-HtmlAgilityPack  任务调度-Quartz.NET

 

基本原理

 

  部分网站上有免费的代理IP信息,比如xicidaili.com,proxy360.cn。这些网站有很多免费代理IP,然而有些质量不好,需要程序及时从代理池中删掉质量低的代理,不断加入优质代理。

  思路来自知乎-https://www.zhihu.com/question/25566731

原理示意图

 接下来代码实现

创建一个ProxyPool的控制台应用程序,并使用NuGet添加ServiceStack.Redis、HtmlAgilityPack、Quartz.NET包

   创建一个Proxy类


 /// <summary>
 /// One proxy endpoint, stored as an address plus a port.
 /// </summary>
 public class Proxy
 {
     /// <summary>Address of the proxy; carries the <c>[Key]</c> attribute.</summary>
     [Key]
     public string Adress { get; set; }

     /// <summary>Port of the proxy.</summary>
     public int port { get; set; }
 }
代理池的管理

封装一个资源获取方法

 


  /// <summary>
  /// Requests <paramref name="url"/> with a Firefox User-Agent and returns the
  /// response body decoded as UTF-8 text.
  /// </summary>
  /// <param name="url">URL to request.</param>
  /// <returns>
  /// The response body (un-gzipped or inflated when the response's
  /// Content-Encoding says so), or an empty string when the request fails.
  /// </returns>
  public string DownloadHtml(string url)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0";
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream body = OpenDecodedStream(response))
                using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
                {
                    return reader.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Best-effort download: callers treat an empty string as "nothing
                // scraped", so report the failure instead of hiding it and carry on.
                Console.WriteLine("DownloadHtml failed for " + url + ": " + ex.Message);
                return string.Empty;
            }
        }

        /// <summary>
        /// Fetches the response stream once and wraps it in the decompressor that
        /// matches the response's Content-Encoding (gzip or deflate), if any.
        /// </summary>
        private static Stream OpenDecodedStream(HttpWebResponse response)
        {
            Stream raw = response.GetResponseStream();
            string encoding = response.ContentEncoding ?? string.Empty;

            // Case-insensitive substring match, same intent as the old ToLower().Contains().
            if (encoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return new GZipStream(raw, CompressionMode.Decompress);
            }

            if (encoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return new DeflateStream(raw, CompressionMode.Decompress);
            }

            return raw;
        }

 



检测代理是否有效
 /// <summary>
 /// Probes a proxy by fetching https://www.baidu.com/ through it with a
 /// one-second request timeout.
 /// </summary>
 /// <param name="proxy">Proxy whose <c>Adress</c> and <c>port</c> are used for the request.</param>
 /// <returns>
 /// <c>true</c> when the page comes back and its text contains "百度";
 /// <c>false</c> on any failure (timeout, refused connection, bad address, ...).
 /// </returns>
 public static bool IsAvailable(Proxy proxy)
        {
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create("https://www.baidu.com/");
                request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0";
                request.Proxy = new WebProxy(proxy.Adress, proxy.port);
                request.Timeout = 1000;

                // The response is disposed on every path; previously it was never
                // closed, so each probe left its connection open.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream dataStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(dataStream, Encoding.UTF8))
                {
                    return reader.ReadToEnd().Contains("百度");
                }
            }
            catch (Exception)
            {
                // Any failure simply means the proxy is not usable.
                return false;
            }
        }

 

将代理添加到Redis的Set集合

/// <summary>
/// Adds <paramref name="proxy"/> to the "ProxyPool" Redis set as "address:port",
/// but only if <c>IsAvailable</c> reports that it currently works.
/// </summary>
/// <param name="proxy">Candidate proxy to test and store.</param>
public void Add(Proxy proxy)
        {
            // Probe first so no Redis client is held open during the network test.
            if (!IsAvailable(proxy))
            {
                return;
            }

            Console.WriteLine(proxy.Adress);
            using (RedisClient client = new RedisClient("127.0.0.1", 6379))
            {
                client.AddItemToSet("ProxyPool", proxy.Adress + ":" + proxy.port.ToString());
            }
        }

下载西刺代理

  /// <summary>
  /// Downloads the four xicidaili.com listing pages, reads address (2nd cell)
  /// and port (3rd cell) from every <c>tr class="odd"</c> row, and passes each
  /// proxy to <c>Add</c>.
  /// </summary>
  /// <param name="DATA">Not used by this method.</param>
  public void Downloadxicidaili(object DATA)//download the xicidaili HTML pages
        {
            string[] pages =
            {
                "http://www.xicidaili.com/nt/",
                "http://www.xicidaili.com/nn/",
                "http://www.xicidaili.com/wn/",
                "http://www.xicidaili.com/wt/"
            };

            foreach (string url in pages)
            {
                // Each page gets its own try block so that one failing page no
                // longer prevents the remaining pages from being scraped.
                try
                {
                    HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                    doc.LoadHtml(DownloadHtml(url));

                    HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//tr[@class='odd']");
                    if (rows == null)
                    {
                        // No matching rows (e.g. the download returned an empty page).
                        continue;
                    }

                    foreach (HtmlNode row in rows)
                    {
                        HtmlNode addressCell = row.SelectSingleNode("td[2]");
                        HtmlNode portCell = row.SelectSingleNode("td[3]");
                        int port;
                        if (addressCell == null || portCell == null
                            || !int.TryParse(portCell.InnerHtml.Trim(), out port))
                        {
                            // Skip rows that do not look like "address / port".
                            continue;
                        }

                        Proxy proxy = new Proxy();
                        proxy.Adress = addressCell.InnerHtml.Trim();
                        proxy.port = port;
                        Console.WriteLine(proxy.Adress);
                        Add(proxy);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine("xicidaili page " + url + " failed: " + ex.Message);
                }
            }

            Console.WriteLine("西刺");
        }

 

快代理

/// <summary>
/// Downloads listing pages 1 to 3, reads the address from each row's first
/// child node and the port from its 2nd cell, and passes each proxy to <c>Add</c>.
/// </summary>
/// <param name="DATA">Not used by this method.</param>
public void Downkuaidaili(object DATA)//download kuaidaili
        {
            // NOTE(review): the method is named for kuaidaili, yet this base URL is the
            // xicidaili.com one. It looks like a copy-paste leftover; confirm the
            // intended kuaidaili listing URL before changing it.
            string url = "http://www.xicidaili.com/nt/";

            for (int i = 1; i < 4; i++)
            {
                string pageUrl = url + i.ToString();

                // Per-page try block: a bad page is reported and skipped without
                // silently ending the whole run.
                try
                {
                    HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                    doc.LoadHtml(DownloadHtml(pageUrl));

                    HtmlNodeCollection rows = doc.DocumentNode.SelectNodes("//tbody/tr");
                    if (rows == null)
                    {
                        // No matching rows on this page.
                        continue;
                    }

                    foreach (HtmlNode row in rows)
                    {
                        HtmlNode addressNode = row.FirstChild;
                        HtmlNode portCell = row.SelectSingleNode("td[2]");
                        int port;
                        if (addressNode == null || portCell == null
                            || !int.TryParse(portCell.InnerHtml.Trim(), out port))
                        {
                            // Skip rows that do not look like "address / port".
                            continue;
                        }

                        Proxy proxy = new Proxy();
                        proxy.Adress = addressNode.InnerHtml.Trim();
                        proxy.port = port;
                        Console.WriteLine(proxy.Adress);
                        Add(proxy);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine("kuaidaili page " + pageUrl + " failed: " + ex.Message);
                }
            }
        }

Proxy360

 /// <summary>
 /// Downloads the proxy360.cn default page, reads address (<c>span[1]</c>) and
 /// port (<c>span[2]</c>) from the second child node of every
 /// <c>div class="proxylistitem"</c>, and passes each proxy to <c>Add</c>.
 /// </summary>
 /// <param name="DATA">Not used by this method.</param>
 public void Downloadproxy360(object DATA)//download proxy360
        {
            try
            {
                string url = "http://www.proxy360.cn/default.aspx";
                HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
                doc.LoadHtml(DownloadHtml(url));

                HtmlNodeCollection items = doc.DocumentNode.SelectNodes("//div[@class='proxylistitem']");
                if (items != null)
                {
                    foreach (HtmlNode item in items)
                    {
                        if (item.ChildNodes.Count < 2)
                        {
                            // The entry lacks the child node that holds the spans.
                            continue;
                        }

                        HtmlNode details = item.ChildNodes[1];
                        HtmlNode addressSpan = details.SelectSingleNode("span[1]");
                        HtmlNode portSpan = details.SelectSingleNode("span[2]");
                        int port;
                        if (addressSpan == null || portSpan == null
                            || !int.TryParse(portSpan.InnerHtml.Trim(), out port))
                        {
                            // Skip entries that do not look like "address / port".
                            continue;
                        }

                        Proxy proxy = new Proxy();
                        proxy.Adress = addressSpan.InnerHtml.Trim();
                        proxy.port = port;
                        Console.WriteLine(proxy.Adress);
                        Add(proxy);
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine("proxy360 scrape failed: " + ex.Message);
            }

            Console.WriteLine("proxy360");
        }

 

 多线程爬取
  /// <summary>
  /// Queues the three scraping methods on the thread pool, in the order
  /// xicidaili, kuaidaili, proxy360.
  /// </summary>
  public void Initial()
        {
            WaitCallback[] scrapers = { Downloadxicidaili, Downkuaidaili, Downloadproxy360 };
            foreach (WaitCallback scraper in scrapers)
            {
                ThreadPool.QueueUserWorkItem(scraper);
            }
        }

 删除接口和随机获取接口

 /// <summary>
 /// Static access to the "ProxyPool" Redis set: pick a random entry or remove one.
 /// </summary>
 public  class Pool
    {
         // NOTE(review): the Add method shown with this class writes to 127.0.0.1,
         // while this class talks to 59.74.169.57. Confirm both are meant to reach
         // the same Redis instance.
         private const string RedisHost = "59.74.169.57";
         private const int RedisPort = 6379;
         private const string PoolKey = "ProxyPool";

         /// <summary>Returns one random member of the pool set.</summary>
         /// <returns>
         /// Whatever Redis returns for a random member of the set, or an empty
         /// string when the Redis call throws.
         /// </returns>
         public static string  GetProxy()
         {
             string result = string.Empty;

             try
             {
                 using (RedisClient client = new RedisClient(RedisHost, RedisPort))
                 {
                     result = client.GetRandomItemFromSet(PoolKey);
                 }
             }
             catch (Exception ex)
             {
                 // Still best-effort (callers get an empty string), but the failure
                 // is now visible on the console.
                 Console.WriteLine("GetProxy failed: " + ex.Message);
             }

             return result;
         }

         /// <summary>
         /// Removes <paramref name="value"/> from the pool set. Despite the
         /// name, nothing is pushed.
         /// </summary>
         /// <param name="value">The "address:port" entry to remove.</param>
         public static void  PushProxy(string value)
         {
             try
             {
                 using (RedisClient client = new RedisClient(RedisHost, RedisPort))
                 {
                     client.RemoveItemFromSet(PoolKey, value);
                 }
             }
             catch
             {
                 Console.WriteLine("删除代理失败!");
             }
         }
    }

不断检测代理池
 /// <summary>
 /// Endless checker loop: every 500 ms it takes one random pool entry, re-tests
 /// it with <c>IsAvailable</c>, and removes it from the pool when the test fails
 /// or the entry cannot be parsed as "address:port".
 /// </summary>
 public void TestAll()
        {
            while (true)
            {
                string entry = Pool.GetProxy();
                if (!string.IsNullOrEmpty(entry))
                {
                    Proxy candidate;

                    // A malformed entry used to throw out of this loop and stop the
                    // checker. It is now treated like a dead proxy and removed.
                    if (!TryParseEntry(entry, out candidate) || !IsAvailable(candidate))
                    {
                        Pool.PushProxy(entry);
                    }
                }

                Thread.Sleep(500);
            }
        }

        /// <summary>
        /// Splits an "address:port" pool entry at its first colon.
        /// </summary>
        /// <returns><c>false</c> when there is no address part or the port is not an integer.</returns>
        private static bool TryParseEntry(string entry, out Proxy proxy)
        {
            proxy = null;

            int separator = entry.IndexOf(':');
            if (separator <= 0)
            {
                return false;
            }

            int port;
            if (!int.TryParse(entry.Substring(separator + 1), out port))
            {
                return false;
            }

            proxy = new Proxy();
            proxy.Adress = entry.Substring(0, separator);
            proxy.port = port;
            return true;
        }
利用Quartz.net对任务进行定时调度
job类
 /// <summary>
 /// Scheduler job: on every execution it creates a <c>PoolManage</c> and calls
 /// its <c>Initial</c> method.
 /// </summary>
 class TotalJob : IJob
 {
     /// <param name="context">Execution context supplied by the scheduler; not used.</param>
     public void Execute(IJobExecutionContext context)
     {
         new PoolManage().Initial();
     }
 }
任务声明
 // Scheduler started by Run(); kept in a field so Main can shut it down on exit.
 private static IScheduler _scheduler;

 /// <summary>
 /// Starts the scheduled scraping job, then waits for a key press before
 /// shutting the scheduler down.
 /// </summary>
 static void Main(string[] args)
        {
            Run();
            Console.WriteLine("Press any key to close the application");

            // Actually wait for the key the message asks for. Previously Main
            // returned immediately and the scheduler was never stopped.
            Console.ReadKey(true);

            if (_scheduler != null)
            {
                _scheduler.Shutdown();
            }
        }

        /// <summary>
        /// Starts a scheduler and registers <c>TotalJob</c> to fire immediately
        /// and then once a minute, forever. Scheduler errors are written to the
        /// console.
        /// </summary>
        private static void Run()
        {
            try
            {
                _scheduler = new StdSchedulerFactory().GetScheduler();
                _scheduler.Start();

                IJobDetail job = JobBuilder.Create<TotalJob>()
                    .WithIdentity("job1", "group1")
                    .Build();

                ITrigger trigger = TriggerBuilder.Create()
                    .WithIdentity("trigger1", "group1")
                    .StartNow()
                    .WithSimpleSchedule(x => x.WithIntervalInMinutes(1).RepeatForever())
                    .Build();

                _scheduler.ScheduleJob(job, trigger);
            }
            catch (SchedulerException se)
            {
                Console.WriteLine(se);
            }
        }

 

 

 

 Github地址:https://github.com/wangqifan/ProxyPool
posted @ 2017-01-08 11:15  王起帆  阅读(7035)  评论(6)  编辑  收藏  举报