记一次企业级爬虫系统升级改造(六):基于Redis实现免费的IP代理池
前言:
首先表示抱歉,春节后一直较忙,未及时更新该系列文章。
近期,由于监控的站源越来越多,就偶有站源做了反爬机制,造成我们的SupportYun系统小爬虫服务时常被封IP,不能进行数据采集。
这时候,前面有园友提到的IP代理就该上场表演了。
IP代理池设计:
博主查阅与调研了多方资料,最终决定先通过爬取网络上各大IP代理网站免费代理的方式,来建立自己的IP代理池。
最终爬取了五家较为优质的IP代理站点:
1.西刺代理
2.快代理
3.逼格代理
4.proxy360
5.66免费代理
IP代理池方案设计如下:
简单点说就是把在采集的站源里面已知具有反爬机制的站源打上标签,修改所有的爬虫服务,遇到有此标签的站源先从IP代理池随机获取可用的代理IP再进行数据爬取。
安装Redis:
首先,我们需要一台服务器来部署我们的Redis服务(先不考虑集群什么的)。
博主一向不喜欢弹个小黑框,不停敲命令行进行操作的各种方式。个人认为,GUI是推动计算机快速发展的重要因素之一(非喜勿喷)。
翻阅了资料,找到了简易的redis安装客户端(windows版本,安装简单到爆),地址如下:
http://download.csdn.net/detail/cb511612371/9784687
在博客园找到一篇介绍redis配置文件的博文,贴出来供大家参考:http://www.cnblogs.com/kreo/p/4423362.html
话说博主就简单的修改了一下内存限制,设置了允许外网连接,设置了一个密码,也没多改其他东西。
注意,配置文件在安装完成后的目录下,名称是:redis.windows-service.conf
熟悉一点的都知道,redis的c#驱动ServiceStack.Redis,通过NuGet就可以直接安装。比较坑的是4.0版本后商业化了,限制每小时6000次请求,要么下载3.9版本,要么考虑其他的驱动,例如:StackExchange.Redis。
博主使用的是ServiceStack V3.9版本,附上下载地址:http://download.csdn.net/detail/cb511612371/9784626
下面附上博主基于ServiceStack写的RedisManageService,由于业务简单,只使用到了几个API,大家凑合着看。
/// <summary>
/// Thin static wrapper around the ServiceStack.Redis client.
/// Only Redis SET operations are exposed because that is all the proxy pool needs.
/// Every call opens a short-lived <see cref="RedisClient"/> and disposes it afterwards.
/// </summary>
public class RedisManageService
{
    /// <summary>Redis host, read from the "RedisAddress" app setting.</summary>
    private static readonly string redisAddress = ConfigurationManager.AppSettings["RedisAddress"];

    /// <summary>
    /// Redis password. An optional "RedisPassword" app setting takes precedence;
    /// the literal is kept only as a fallback so existing deployments keep working.
    /// NOTE(review): move the secret out of source control and drop the fallback.
    /// </summary>
    private static readonly string redisPassword =
        ConfigurationManager.AppSettings["RedisPassword"] ?? "myRedisPassword";

    /// <summary>TCP port the Redis server listens on.</summary>
    private const int RedisPort = 6379;

    /// <summary>
    /// Creates a client for the configured server. The caller owns the instance and must dispose it.
    /// </summary>
    private static RedisClient CreateClient()
    {
        return new RedisClient(redisAddress, RedisPort, redisPassword);
    }

    /// <summary>
    /// Returns one random member of the given set.
    /// </summary>
    /// <param name="setName">Set to read from; the enum name is used as the Redis key.</param>
    /// <returns>A random member of the set.</returns>
    /// <exception cref="Exception">Thrown when the client returns no item for the set.</exception>
    public static string GetRandomItemFromSet(RedisSetNameEnum setName)
    {
        using (RedisClient client = CreateClient())
        {
            var result = client.GetRandomItemFromSet(setName.ToString());
            if (result == null)
            {
                throw new Exception("redis set集合"+setName.ToString()+"已无数据!");
            }
            return result;
        }
    }

    /// <summary>
    /// Removes a single value from the given set.
    /// </summary>
    /// <param name="setName">Set to modify.</param>
    /// <param name="value">Member to remove.</param>
    public static void RemoveItemFromSet(RedisSetNameEnum setName, string value)
    {
        using (RedisClient client = CreateClient())
        {
            client.RemoveItemFromSet(setName.ToString(), value);
        }
    }

    /// <summary>
    /// Adds a single value to the given set.
    /// </summary>
    /// <param name="setName">Set to modify.</param>
    /// <param name="value">Member to add.</param>
    public static void AddItemToSet(RedisSetNameEnum setName, string value)
    {
        using (RedisClient client = CreateClient())
        {
            client.AddItemToSet(setName.ToString(), value);
        }
    }

    /// <summary>
    /// Adds a list of values to the given set.
    /// </summary>
    /// <param name="setName">Set to modify.</param>
    /// <param name="values">Members to add; a null or empty list is a no-op.</param>
    public static void AddItemListToSet(RedisSetNameEnum setName, List<string> values)
    {
        // Nothing to send: skip opening a connection at all.
        if (values == null || values.Count == 0)
        {
            return;
        }

        using (RedisClient client = CreateClient())
        {
            client.AddRangeToSet(setName.ToString(), values);
        }
    }

    /// <summary>
    /// Tells whether a value is a member of the given set.
    /// </summary>
    /// <param name="setName">Set to query.</param>
    /// <param name="value">Candidate member.</param>
    /// <returns>True when the value is in the set.</returns>
    public static bool JudgeItemInSet(RedisSetNameEnum setName, string value)
    {
        using (RedisClient client = CreateClient())
        {
            // Ask the server for membership instead of downloading the whole set
            // and scanning it on the client.
            return client.SetContainsItem(setName.ToString(), value);
        }
    }

    /// <summary>
    /// Returns the number of members in the given set.
    /// </summary>
    /// <param name="setName">Set to query.</param>
    /// <returns>Member count.</returns>
    public static long GetSetCount(RedisSetNameEnum setName)
    {
        using (RedisClient client = CreateClient())
        {
            return client.GetSetCount(setName.ToString());
        }
    }
}
免费代理IP抓取服务实现:
我们首先设计一个最简单的IpProxy对象:
/// <summary>
/// One proxy endpoint: a host address plus the TCP port it listens on.
/// </summary>
public class IpProxy
{
    /// <summary>
    /// Host address of the proxy.
    /// </summary>
    public string Address { get; set; }

    /// <summary>
    /// Port of the proxy.
    /// </summary>
    public int Port { get; set; }
}
然后实现一个基于Redis的Ip代理池操作服务:
/// <summary>
/// Proxy pool operations backed by the Redis set <c>RedisSetNameEnum.ProxyPool</c>.
/// Entries are stored as "address:port" strings.
/// </summary>
public class PoolManageService
{
    /// <summary>
    /// Picks random entries from the pool until one passes the availability check.
    /// Entries that fail the check, or that cannot be parsed as "address:port",
    /// are removed from the pool along the way.
    /// </summary>
    /// <returns>
    /// A verified "address:port" entry, or an empty string when none could be obtained
    /// (the failure is logged).
    /// </returns>
    public static string GetProxy()
    {
        try
        {
            // Iterate instead of recursing so a long run of dead proxies cannot grow the stack.
            // The loop ends when a live proxy is found or when the Redis layer throws
            // (which it does once the set has no more members).
            while (true)
            {
                string candidate = RedisManageService.GetRandomItemFromSet(RedisSetNameEnum.ProxyPool);
                if (string.IsNullOrEmpty(candidate))
                {
                    return string.Empty;
                }

                string host;
                int port;
                if (TryParseEndpoint(candidate, out host, out port) && HttpHelper.IsAvailable(host, port))
                {
                    return candidate;
                }

                DeleteProxy(candidate);
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("从代理池获取代理数据出错", e));
            // Never hand back a candidate that was not successfully verified.
            return string.Empty;
        }
    }

    /// <summary>
    /// Removes one entry from the pool. Failures are logged, not thrown.
    /// </summary>
    /// <param name="value">The "address:port" entry to remove.</param>
    public static void DeleteProxy(string value)
    {
        try
        {
            RedisManageService.RemoveItemFromSet(RedisSetNameEnum.ProxyPool, value);
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("从代理池删除代理数据出错", e));
        }
    }

    /// <summary>
    /// Adds a proxy to the pool, but only if it currently passes the availability check.
    /// Failures are logged, not thrown.
    /// </summary>
    /// <param name="proxy">Proxy to verify and store.</param>
    public static void Add(IpProxy proxy)
    {
        try
        {
            if (HttpHelper.IsAvailable(proxy.Address, proxy.Port))
            {
                RedisManageService.AddItemToSet(RedisSetNameEnum.ProxyPool, proxy.Address + ":" + proxy.Port.ToString());
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("添加一条代理数据到代理池出错", e));
        }
    }

    /// <summary>
    /// Splits an "address:port" entry once and validates both parts.
    /// </summary>
    /// <param name="value">Raw pool entry.</param>
    /// <param name="host">Receives the address part on success.</param>
    /// <param name="port">Receives the numeric port on success.</param>
    /// <returns>True when the entry has a non-empty address and a numeric port.</returns>
    private static bool TryParseEndpoint(string value, out string host, out int port)
    {
        host = null;
        port = 0;

        string[] parts = value.Split(new[] { ':' });
        if (parts.Length < 2 || parts[0].Length == 0)
        {
            return false;
        }

        host = parts[0];
        return int.TryParse(parts[1], out port);
    }
}
提供简易的三个方法:添加代理IP、删除代理IP、随机获取一条代理IP
我们还需要一个爬虫服务,来爬取我们需要的免费代理IP数据:
/// <summary>
/// Spider that scrapes free proxy listing sites and feeds every candidate
/// into <c>PoolManageService.Add</c>, which verifies it before storing it.
/// TODO: the listing sites change their markup often; keep an eye on the error log.
/// </summary>
public class IpPoolSpider
{
    /// <summary>
    /// Queues the site crawlers on the thread pool and returns immediately.
    /// </summary>
    public void Initial()
    {
        // NOTE(review): Downkuaidaili is defined below but is not queued here — confirm
        // whether that is intentional before enabling it.
        ThreadPool.QueueUserWorkItem(Downloadproxy360);
        ThreadPool.QueueUserWorkItem(DownloadproxyBiGe);
        ThreadPool.QueueUserWorkItem(Downloadproxy66);
        ThreadPool.QueueUserWorkItem(Downloadxicidaili);
    }

    /// <summary>
    /// Crawls the xicidaili listing pages. Each page is fetched through a proxy taken
    /// from the pool; the crawl stops as soon as the pool has no usable proxy.
    /// </summary>
    /// <param name="DATA">Unused thread-pool state object.</param>
    public void Downloadxicidaili(object DATA)
    {
        try
        {
            List<string> list = new List<string>()
            {
                "http://www.xicidaili.com/nt/",
                "http://www.xicidaili.com/nn/",
                "http://www.xicidaili.com/wn/",
                "http://www.xicidaili.com/wt/"
            };
            foreach (var utlitem in list)
            {
                for (int i = 1; i < 5; i++)
                {
                    string url = utlitem + i.ToString();
                    var ipProxy = PoolManageService.GetProxy();
                    if (string.IsNullOrEmpty(ipProxy))
                    {
                        LogUtils.ErrorLog(new Exception("Ip代理池暂无可用代理IP"));
                        return;
                    }

                    string html = HttpHelper.DownloadHtml(url, CreateWebProxy(ipProxy));
                    if (string.IsNullOrEmpty(html))
                    {
                        LogUtils.ErrorLog(new Exception("代理地址:" + url + " 访问失败"));
                        continue;
                    }

                    foreach (var item in SelectRows(html, "//tr[@class='odd']"))
                    {
                        QueueProxy(item.SelectSingleNode("td[2]"), item.SelectSingleNode("td[3]"));
                    }
                }
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载西刺代理IP池出现故障", e));
        }
    }

    /// <summary>
    /// Crawls the kuaidaili listing pages (pages 1 to 3).
    /// </summary>
    /// <param name="DATA">Unused thread-pool state object.</param>
    public void Downkuaidaili(object DATA)
    {
        try
        {
            string url = "http://www.kuaidaili.com/proxylist/";
            for (int i = 1; i < 4; i++)
            {
                string pageUrl = url + i.ToString() + "/";
                string html = HttpHelper.DownloadHtml(pageUrl, null);
                if (string.IsNullOrEmpty(html))
                {
                    // A failed page must not abort the remaining pages.
                    LogUtils.ErrorLog(new Exception("代理地址:" + pageUrl + " 访问失败"));
                    continue;
                }

                foreach (var item in SelectRows(html, "//tbody/tr"))
                {
                    QueueProxy(item.FirstChild, item.SelectSingleNode("td[2]"));
                }
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载快代理IP池出现故障", e));
        }
    }

    /// <summary>
    /// Crawls the proxy360 front page.
    /// </summary>
    /// <param name="DATA">Unused thread-pool state object.</param>
    public void Downloadproxy360(object DATA)
    {
        try
        {
            string url = "http://www.proxy360.cn/default.aspx";
            string html = HttpHelper.DownloadHtml(url, null);
            if (string.IsNullOrEmpty(html))
            {
                LogUtils.ErrorLog(new Exception("代理地址:" + url + " 访问失败"));
                return;
            }

            foreach (var item in SelectRows(html, "//div[@class='proxylistitem']"))
            {
                // The address and port spans are read from the item's second child node.
                if (item.ChildNodes.Count < 2)
                {
                    continue;
                }
                var childnode = item.ChildNodes[1];
                QueueProxy(childnode.SelectSingleNode("span[1]"), childnode.SelectSingleNode("span[2]"));
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载proxy360IP池出现故障", e));
        }
    }

    /// <summary>
    /// Crawls the bigdaili listing pages (4 categories, pages 1 to 4 each).
    /// </summary>
    /// <param name="DATA">Unused thread-pool state object.</param>
    public void DownloadproxyBiGe(object DATA)
    {
        try
        {
            List<string> list = new List<string>()
            {
                "http://www.bigdaili.com/dailiip/1/{0}.html",
                "http://www.bigdaili.com/dailiip/2/{0}.html",
                "http://www.bigdaili.com/dailiip/3/{0}.html",
                "http://www.bigdaili.com/dailiip/4/{0}.html"
            };
            foreach (var utlitem in list)
            {
                for (int i = 1; i < 5; i++)
                {
                    string url = String.Format(utlitem, i);
                    string html = HttpHelper.DownloadHtml(url, null);
                    if (string.IsNullOrEmpty(html))
                    {
                        LogUtils.ErrorLog(new Exception("代理地址:" + url + " 访问失败"));
                        continue;
                    }

                    foreach (var item in SelectRows(html, "//tbody/tr"))
                    {
                        QueueProxy(item.SelectSingleNode("td[1]"), item.SelectSingleNode("td[2]"));
                    }
                }
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载逼格代理IP池出现故障", e));
        }
    }

    /// <summary>
    /// Crawls three 66ip.cn area listing pages.
    /// </summary>
    /// <param name="DATA">Unused thread-pool state object.</param>
    public void Downloadproxy66(object DATA)
    {
        try
        {
            List<string> list = new List<string>()
            {
                "http://www.66ip.cn/areaindex_35/index.html",
                "http://www.66ip.cn/areaindex_35/2.html",
                "http://www.66ip.cn/areaindex_35/3.html"
            };
            foreach (var utlitem in list)
            {
                string url = utlitem;
                string html = HttpHelper.DownloadHtml(url, null);
                if (string.IsNullOrEmpty(html))
                {
                    LogUtils.ErrorLog(new Exception("代理地址:" + url + " 访问失败"));
                    // Skip only this page, like the other crawlers do.
                    continue;
                }

                foreach (var item in SelectRows(html, "//table[@bordercolor='#6699ff']/tr"))
                {
                    var addressNode = item.SelectSingleNode("td[1]");
                    // Rows whose first cell contains the text "ip" are skipped
                    // (presumably the table header — confirm against the live page).
                    if (addressNode != null && addressNode.InnerHtml.Contains("ip"))
                    {
                        continue;
                    }
                    QueueProxy(addressNode, item.SelectSingleNode("td[2]"));
                }
            }
        }
        catch (Exception e)
        {
            LogUtils.ErrorLog(new Exception("下载66免费代理IP池出现故障", e));
        }
    }

    /// <summary>
    /// Builds the <see cref="WebProxy"/> used to fetch a page from a pool entry.
    /// Entries containing ':' are treated as "address:port"; anything else is passed through as-is.
    /// </summary>
    private static WebProxy CreateWebProxy(string ipProxy)
    {
        if (!ipProxy.Contains(":"))
        {
            return new WebProxy(ipProxy);
        }

        string[] parts = ipProxy.Split(new[] { ':' });
        return new WebProxy(parts[0], int.Parse(parts[1]));
    }

    /// <summary>
    /// Parses the HTML and returns the nodes matching the XPath.
    /// SelectNodes yields null when nothing matches, so that case is mapped to an
    /// empty sequence and callers can iterate unconditionally.
    /// </summary>
    private static IEnumerable<HtmlNode> SelectRows(string html, string xpath)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        HtmlNodeCollection rows = doc.DocumentNode.SelectNodes(xpath);
        if (rows == null)
        {
            return new List<HtmlNode>();
        }
        return rows;
    }

    /// <summary>
    /// Turns an address cell and a port cell into an <see cref="IpProxy"/> and hands it to the
    /// pool on a background task. Rows with a missing cell, an empty address, or a
    /// non-numeric port are skipped instead of aborting the whole crawl.
    /// </summary>
    private static void QueueProxy(HtmlNode addressNode, HtmlNode portNode)
    {
        if (addressNode == null || portNode == null)
        {
            return;
        }

        string address = addressNode.InnerHtml.Trim();
        int port;
        if (address.Length == 0 || !int.TryParse(portNode.InnerHtml, out port))
        {
            return;
        }

        var proxy = new IpProxy();
        proxy.Address = address;
        proxy.Port = port;
        // Verification involves network I/O, so it is pushed off the crawling thread.
        Task.Run(() =>
        {
            PoolManageService.Add(proxy);
        });
    }
}
这段代码也没什么营养,就不仔细解释了。
前面有说到,博主的爬虫服务都是以windows服务的方式部署的。以前一直用Timer来实现固定间隔多次循环,这次博主引用了Quartz.NET任务调度框架来做,代码看起来更优美一点。
Quartz.NET可直接在NuGet下载安装。
先写一个代理池的总调度任务类ProxyPoolTotalJob,继承IJob接口:
/// <summary>
/// Quartz job that kicks off one full crawl of the proxy listing sites.
/// </summary>
class ProxyPoolTotalJob : IJob
{
    /// <summary>
    /// Called by the scheduler on every trigger fire; starts a fresh spider run.
    /// </summary>
    /// <param name="context">Execution context supplied by Quartz (not used).</param>
    public void Execute(IJobExecutionContext context)
    {
        new IpPoolSpider().Initial();
    }
}
接下来是在OnStart中运行的Run()方法实现:
/// <summary>
/// Starts the Quartz scheduler and registers the proxy pool job so that it
/// fires immediately and then every 28 minutes, forever.
/// Scheduler errors are written to the console.
/// </summary>
private static void Run()
{
    try
    {
        IScheduler quartz = new StdSchedulerFactory().GetScheduler();
        quartz.Start();

        IJobDetail poolJob = JobBuilder
            .Create<ProxyPoolTotalJob>()
            .WithIdentity("job1", "group1")
            .Build();

        // Fire now, then repeat on a fixed 28-minute interval.
        ITrigger repeatingTrigger = TriggerBuilder
            .Create()
            .WithIdentity("trigger1", "group1")
            .StartNow()
            .WithSimpleSchedule(schedule => schedule.WithIntervalInMinutes(28).RepeatForever())
            .Build();

        quartz.ScheduleJob(poolJob, repeatingTrigger);
    }
    catch (SchedulerException se)
    {
        Console.WriteLine(se);
    }
}
最后采集具有反爬机制的html页面的时候,使用代理IP,这个相信大家都会,设置一下webRequest的Proxy参数即可。
// Send this request through the given proxy address and port.
webRequest.Proxy = new WebProxy(ip, port);
以上,就实现了一个基于redis的免费代理IP池。我们被封IP的爬虫服务又满血复活了,继续采集新数据去。
原创文章,代码都是从自己项目里贴出来的。转载请注明出处哦,亲~~~