Web scraping notes
User-Agent and proxy code:
# -*- coding: utf-8 -*-
import random
import re
import time
import urllib2

count = 0
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]

def get_proxy_ip():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    url = "http://www.xicidaili.com/nn/"  # paging through the site yields more proxy addresses
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    content = response.read().decode("utf-8")
    ip_pattern = re.compile(r"\d+\.\d+\.\d+\.\d+")
    port_pattern = re.compile(r"<td>\d+</td>")
    ip_list = re.findall(ip_pattern, content)
    port_list = re.findall(port_pattern, content)
    proxy_list = []
    for i in range(len(ip_list)):
        ip = ip_list[i].encode('utf-8')
        port = re.sub(r'<td>|</td>', '', port_list[i]).encode('utf-8')
        proxy_list.append("%s:%s" % (ip, port))
    return proxy_list

def proxy_test(user_agent_list, proxy_list, i):
    proxy_ip = proxy_list[i]
    print "current proxy: %s" % proxy_ip
    user_agent = random.choice(user_agent_list)
    print "current user_agent: %s" % user_agent
    sleep_time = random.randint(1, 3)
    print "current sleep_time: %s" % sleep_time
    time.sleep(sleep_time)
    print "starting the test"
    headers = {'User-Agent': user_agent}
    url = "http://cuiqingcai.com/1052.html"
    proxy_support = urllib2.ProxyHandler({'http': proxy_ip})
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)
    req = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(req).read().decode("utf-8")
    except Exception, e:
        print "failed: %s" % e
    else:
        global count
        count += 1
        print 'OK! %s successes in total!' % count

if __name__ == "__main__":
    proxy_list = get_proxy_ip()
    for i in range(15):
        proxy_test(user_agent_list, proxy_list, i)
(After the proxy site changed its address, the updated code is as follows:)
# coding: utf-8
import random
import re
import ssl
import urllib2

# new address: https://www.kuaidaili.com/free/
def get_proxy_ip():
    context = ssl._create_unverified_context()  # needed when requesting HTTPS sites
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    url = "https://www.kuaidaili.com/free/"  # paging through the site yields more proxy addresses
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req, context=context)
    content = response.read().decode("utf-8")
    ip_pattern = re.compile(r"\d+\.\d+\.\d+\.\d+")
    port_pattern = re.compile(r'<td data-title="PORT">(\d+)</td>')
    type_pattern = re.compile(r'<td data-title=.*>(HTTP[S]{0,1})</td>')
    ip_list = re.findall(ip_pattern, content)
    port_list = re.findall(port_pattern, content)
    type_list = re.findall(type_pattern, content)
    # print ip_list, port_list, type_list
    proxy_list = []
    for i in range(len(ip_list)):
        ip = ip_list[i].encode('utf-8')
        port = port_list[i].encode('utf-8')
        proxy_type = type_list[i].encode('utf-8').lower()  # 'http' or 'https'
        proxy_list.append({proxy_type: "%s:%s" % (ip, port)})
    return random.choice(proxy_list)

def get_user_agent():
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    ]
    return random.choice(user_agent_list)

if __name__ == '__main__':
    proxy_ip = get_proxy_ip()
    user_agent = get_user_agent()
    print proxy_ip, user_agent
Captcha recognition code:
# -*- coding: utf-8 -*-
import cStringIO
import os
import urllib2
from PIL import Image
from pytesser import *

gradu_captcha = 'http://gradinfo.cau.edu.cn/getCaptcha.do'
response = urllib2.urlopen(gradu_captcha).read()
img = cStringIO.StringIO(response)  # wrap the raw bytes in a file-like object
im = Image.open(img)
# im.show()
os.chdir(r'D:\Python\Lib\site-packages')  # required, otherwise the pytesser path cannot be found
text = image_to_string(im)
print text
# In testing: the graduate-school system's captcha is recognized correctly,
# but my library's captcha is not.
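When simple OCR fails (as with the library captcha above), a grayscale-plus-binarization pass with PIL sometimes helps before calling image_to_string. A hedged sketch continuing from the snippet above; the threshold value 140 is an arbitrary guess, not from the original tests, and would need per-site tuning:

# Optional preprocessing before OCR: grayscale + hard threshold.
# The threshold value (140) is a guess and usually needs tuning per site.
im_gray = im.convert('L')                                 # 8-bit grayscale
im_bw = im_gray.point(lambda px: 255 if px > 140 else 0)  # binarize: white above threshold
text = image_to_string(im_bw)
print text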
Manual captcha entry:
# -*- coding: utf-8 -*-
import cookielib
import urllib
import urllib2

# Approach for logging into "My Library": first request the captcha page to obtain
# the captcha image and a cookie, then post to the login page to simulate the login.
captcha_url = 'http://mylib.cau.edu.cn/reader/captcha.php'
login_url = 'http://mylib.cau.edu.cn/reader/redr_verify.php'
cookie = cookielib.MozillaCookieJar(r'D:\PyCharm\cookie.txt')
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
urllib2.install_opener(opener)

# fetch the captcha image (this also sets the session cookie)
req = urllib2.Request(captcha_url)
response = urllib2.urlopen(req).read()
imgfile = 'd:/captcha.jpg'
picture = open(imgfile, 'wb')
picture.write(response)
picture.close()
security_code = raw_input("Please enter the captcha: ")

# build the POST data and headers, then simulate the login (note the cookie)
postdata = {
    "number": "s20153101096",
    "passwd": "*******",
    "captcha": security_code,
    "select": "bar_no",
    'returnUrl': '',
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:49.0) Gecko/20100101 Firefox/49.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
data = urllib.urlencode(postdata)
req2 = urllib2.Request(login_url, data, headers)
try:
    result = urllib2.urlopen(req2).read()
except urllib2.HTTPError, e:
    print e.code
else:
    print result
cookie.save(ignore_discard=True, ignore_expires=True)
(Font-based anti-scraping) the fontTools library
Decoding base64-encoded font data into a font file (see the sketch after the links below): https://blog.csdn.net/huiyinimen/article/details/83444636
Maoyan Movies font anti-scraping: https://zhuanlan.zhihu.com/p/33112359
https://www.jianshu.com/p/0e2e1aa6d270
58.com branded-apartment font anti-scraping: https://blog.csdn.net/m0_37156322/article/details/84658872
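A minimal sketch of the decode step with fontTools, assuming the page embeds the font as a base64 string inside a @font-face rule (font_b64 below is a placeholder you would scrape out of the page source):

import base64
from io import BytesIO
from fontTools.ttLib import TTFont

font_b64 = '...'  # placeholder: base64 font data scraped from the page's @font-face rule
font = TTFont(BytesIO(base64.b64decode(font_b64)))
font.save('site_font.woff')        # keep a copy on disk for inspection in a font viewer
print font.getGlyphOrder()         # glyph names, e.g. ['glyph00000', 'uniE893', ...]
print font['cmap'].getBestCmap()   # codepoint -> glyph-name mapping used to rebuild the text

The remaining site-specific work (mapping obfuscated glyphs back to real digits for Maoyan or 58) is what the linked posts walk through.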
Image hotlink protection (Referer):
The scenario: I want to embed an image from another site in my own page with
<img src="xxxx" />
but the image host has an anti-hotlinking policy: the backend checks whether the request's Referer header comes from a domain other than its own, and returns 403 Forbidden when it does. (A page-side workaround is
<meta name="referrer" content="never" />
which tells the browser not to send the Referer header at all.)
https://juejin.im/entry/5adaa72c6fb9a07aa43bc665
https://bindog.github.io/blog/2014/11/18/http-referer-security-and-anti-anti-hotlink/
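On the scraping side the same check can be satisfied directly: send a Referer that points at the image host itself. A minimal urllib2 sketch; the image URL below is a placeholder for a hotlink-protected image:

import urllib2

img_url = 'http://example.com/images/photo.jpg'  # placeholder: hotlink-protected image
req = urllib2.Request(img_url, headers={
    'Referer': 'http://example.com/',  # pretend the request came from the host's own page
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:49.0) Gecko/20100101 Firefox/49.0',
})
with open('photo.jpg', 'wb') as f:
    f.write(urllib2.urlopen(req).read())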
Bloom filter: a Bloom filter is a data structure for testing whether an element belongs to a set, with fast lookups and a small memory footprint. The price of its cheap inserts and queries is that it is probabilistic: it can only tell you that an element is definitely not in the set, or that it probably is (false positives are possible, false negatives are not). In a crawler it is typically used to deduplicate already-visited URLs.
https://blog.csdn.net/preyta/article/details/72970887
https://stackoverflow.com/questions/311202/modern-high-performance-bloom-filter-in-python
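A minimal pure-Python sketch of the idea for URL deduplication (for production use, prefer a tested library such as those discussed in the links above; the size and hash count below are arbitrary):

import hashlib

class BloomFilter(object):
    def __init__(self, size=2 ** 20, num_hashes=7):
        self.size = size                      # number of bits in the filter
        self.num_hashes = num_hashes          # bit positions set per element
        self.bits = bytearray(size // 8 + 1)  # bit array, packed 8 bits per byte

    def _positions(self, item):
        # Derive num_hashes pseudo-independent bit positions from salted MD5 digests.
        for salt in range(self.num_hashes):
            digest = hashlib.md5(('%d:%s' % (salt, item)).encode('utf-8')).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, item):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item):
        # False means definitely never added; True means probably added
        # (false positives possible, false negatives impossible).
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(item))

bf = BloomFilter()
bf.add('http://example.com/page/1')
print 'http://example.com/page/1' in bf  # True
print 'http://example.com/page/2' in bf  # False (with very high probability)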