Python 3.5 crawler example: working around a site's anti-crawler policy by enabling proxies so the crawler doesn't get banned
How sites detect crawlers: an IP that hits the site too frequently simply gets added to a blacklist.
Strategy 1: limit access frequency per IP and automatically drop connections above the threshold. Countermeasure: slow the crawler down by putting a time.sleep before every request, or rotate IPs (see the sketch after this list).
Strategy 2: count requests server-side and block any single userAgent that exceeds a threshold. The collateral damage is high, so most sites don't use this.
Strategy 3: cookie-based detection, which most sites also don't use.
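To make the first countermeasure concrete, here is a minimal sketch that sleeps a random interval before each request. The delay range, timeout, and helper name are assumptions for illustration, not values from this post:

import random
import time
import requests

def polite_get(url, min_delay=1.0, max_delay=3.0):
    # Sleep a random interval so the per-IP request rate stays below
    # the site's blocking threshold (the 1-3s range is an assumption).
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, timeout=5)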
This example applies the matching countermeasures (User-Agent rotation plus a proxy fallback) to scrape jokes from Qiushibaike (糗事百科).
import requests
import re
import random
import time

# First, find a site that publishes free proxy IPs and scrape them;
# when the local IP gets blocked, switch to one of these proxies.

class download(object):
    def __init__(self):
        self.ip_list = []  # holds the scraped proxy IPs
        html = requests.get("http://haoip.cc/tiqu.htm")
        # grab everything between "r/>" and "<b" in the page source;
        # re.S makes "." match newlines as well
        iplistn = re.findall(r'r/>(.*?)<b', html.text, re.S)
        for ip in iplistn:
            i = re.sub("\n", "", ip)  # re.sub replaces every "\n" with nothing
            self.ip_list.append(i.strip())  # store the cleaned IP

        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

    def get(self, url, timeout, proxy=None, num_retries=6):
        ua = random.choice(self.user_agent_list)  # pick a random User-Agent string
        header = {"User-Agent": ua}  # build the request header

        if proxy is None:  # no proxy yet: try a direct request
            try:
                response = requests.get(url, headers=header, timeout=timeout)
                return response
            except requests.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    print("Fetch failed, retrying in 10s;", num_retries, "attempts left")
                    # num_retries must be passed by keyword here, otherwise
                    # it would be taken as the proxy argument
                    return self.get(url, timeout, num_retries=num_retries - 1)
                else:
                    print("Direct attempts exhausted, switching to a proxy")
                    time.sleep(10)
                    IP = "".join(str(random.choice(self.ip_list)).strip())
                    proxy = {"http": IP}
                    return self.get(url, timeout, proxy)

        else:
            try:
                IP = "".join(str(random.choice(self.ip_list)).strip())  # random proxy IP, whitespace stripped
                proxy = {"http": IP}  # build the proxies dict
                response = requests.get(url, headers=header, proxies=proxy, timeout=timeout)  # fetch through the proxy
                return response
            except requests.RequestException:
                if num_retries > 0:
                    time.sleep(10)
                    print("Changing proxy, retrying in 10s;", num_retries, "attempts left")
                    print("Current proxy:", proxy)
                    return self.get(url, timeout, proxy, num_retries - 1)
                else:
                    print("Proxy errors exhausted, dropping the proxy")
                    return self.get(url, 3)

request = download()
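The free proxies scraped from haoip.cc go stale quickly, so many requests through them will fail and burn retries. As a hedged sketch under that assumption, the list could be pre-filtered by test-fetching a probe URL; the helper name, probe URL, and timeout are all illustrative, not part of the original code:

import requests

def working_proxies(ip_list, test_url="http://httpbin.org/ip", timeout=3):
    # Keep only proxies that can complete a real request; test_url,
    # timeout, and this helper are assumptions for illustration.
    good = []
    for ip in ip_list:
        try:
            requests.get(test_url, proxies={"http": ip}, timeout=timeout)
            good.append(ip)
        except requests.RequestException:
            continue
    return good

Filtering once up front (e.g. request.ip_list = working_proxies(request.ip_list)) keeps the recursive retries in get() from being spent on dead proxies.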
Implementing the joke scraper
# Scrape jokes from Qiushibaike
import requests
from bs4 import BeautifulSoup
from Download import request  # the download() instance above (assumes it is saved as Download.py)

def qsbk(url):
    # First version: plain requests, no User-Agent rotation or proxies
    # header = {
    #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    #     'Accept-Encoding': 'gzip, deflate, sdch',
    #     'Accept-Language': 'zh-CN,zh;q=0.8',
    #     'Cache-Control': 'max-age=0',
    #     'Connection': 'keep-alive',
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    # }
    # rep = requests.get(url, headers=header)
    # html = rep.text
    # bs = BeautifulSoup(html, "html.parser")
    # body = bs.body  # the <body> of the page
    # data = body.find_all("div", {"class": "content"})  # a ResultSet of joke divs
    # for joke in data:
    #     joke_duan = joke.find("span")
    #     if "<br/>" not in str(joke_duan):  # if the joke contains <br/>, .string becomes None
    #         print(joke_duan.string)
    #         print("")
    #         # with open("joke.txt", "w") as f:
    #         #     f.write(joke_duan.string)

    # Second version: fetch through the downloader (UA rotation + proxy fallback)
    html = request.get(url, 3)
    dz = BeautifulSoup(html.text, "html.parser").find_all("div", {"class": "content"})  # ResultSet of joke divs
    for joke in dz:  # each joke is a chunk of html
        duanzi = joke.get_text()
        print(duanzi)

if __name__ == "__main__":
    url = "http://www.qiushibaike.com/"
    qsbk(url)
The function above contains two versions: the first (commented out) fetches directly without any anti-blocking measures; the second fetches through the downloader, with User-Agent rotation and the proxy fallback enabled.
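The commented-out draft also hints at saving jokes to joke.txt. A minimal sketch of that step, assuming the downloader is saved as Download.py; the file name, UTF-8 encoding, and this helper are hypothetical:

from bs4 import BeautifulSoup
from Download import request

def save_jokes(url, path="joke.txt"):
    # Fetch through the downloader (3s timeout) and write every joke
    # to a text file; the path and UTF-8 encoding are assumptions.
    html = request.get(url, 3)
    jokes = BeautifulSoup(html.text, "html.parser").find_all("div", {"class": "content"})
    with open(path, "w", encoding="utf-8") as f:
        for joke in jokes:
            f.write(joke.get_text().strip() + "\n\n")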