Web scraping notes
User-Agent and proxy code:
# -*- coding: utf-8 -*-
import random
import re
import time
import urllib2

count = 0
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
]

def get_proxy_ip():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    url = "http://www.xicidaili.com/nn/"  # paging through the site yields more proxy addresses
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    content = response.read().decode("utf-8")
    ip_pattern = re.compile(r"\d+\.\d+\.\d+\.\d+")
    port_pattern = re.compile(r"<td>\d+</td>")
    ip_list = re.findall(ip_pattern, content)
    port_list = re.findall(port_pattern, content)
    proxy_list = []
    for i in range(len(ip_list)):
        ip = ip_list[i].encode('utf-8')
        port = re.sub(r'<td>|</td>', '', port_list[i]).encode('utf-8')
        proxy_list.append("%s:%s" % (ip, port))
    return proxy_list

def proxy_test(user_agent_list, proxy_list, i):
    proxy_ip = proxy_list[i]
    print "current proxy: %s" % proxy_ip
    user_agent = random.choice(user_agent_list)
    print "current user_agent: %s" % user_agent
    sleep_time = random.randint(1, 3)
    print "current sleep_time: %s" % sleep_time
    time.sleep(sleep_time)
    print "starting the test"
    headers = {'User-Agent': user_agent}
    url = "http://cuiqingcai.com/1052.html"
    proxy_support = urllib2.ProxyHandler({'http': proxy_ip})
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)
    req = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(req).read().decode("utf-8")
    except Exception, e:
        print "failed: %s" % e
    else:
        global count
        count += 1
        print 'OK! %s successes in total!' % count

if __name__ == "__main__":
    proxy_list = get_proxy_ip()
    for i in range(15):
        proxy_test(user_agent_list, proxy_list, i)
(After the proxy site changed its address, the updated code is as follows:)
# coding: utf-8
import random
import re
import ssl
import urllib2

# new address: https://www.kuaidaili.com/free/
def get_proxy_ip():
    context = ssl._create_unverified_context()  # needed when requesting HTTPS sites
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}
    url = "https://www.kuaidaili.com/free/"  # paging through the site yields more proxy addresses
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req, context=context)
    content = response.read().decode("utf-8")
    ip_pattern = re.compile(r"\d+\.\d+\.\d+\.\d+")
    port_pattern = re.compile(r'<td data-title="PORT">(\d+)</td>')
    type_pattern = re.compile(r'<td data-title=.*>(HTTP[S]{0,1})</td>')
    ip_list = re.findall(ip_pattern, content)
    port_list = re.findall(port_pattern, content)
    type_list = re.findall(type_pattern, content)
    # print ip_list, port_list, type_list
    proxy_list = []
    for i in range(len(ip_list)):
        ip = ip_list[i].encode('utf-8')
        port = port_list[i].encode('utf-8')
        proxy_type = type_list[i].encode('utf-8').lower()  # 'http' or 'https'
        proxy_list.append({proxy_type: "%s:%s" % (ip, port)})
    return random.choice(proxy_list)

def get_user_agent():
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    ]
    return random.choice(user_agent_list)

if __name__ == '__main__':
    proxy_ip = get_proxy_ip()
    user_agent = get_user_agent()
    print proxy_ip, user_agent
Captcha recognition code:
# -*- coding: utf-8 -*-
import cStringIO
import os
import urllib2
from PIL import Image
from pytesser import *

gradu_captcha = 'http://gradinfo.cau.edu.cn/getCaptcha.do'
response = urllib2.urlopen(gradu_captcha).read()
img = cStringIO.StringIO(response)  # wrap the raw bytes in a file-like object
im = Image.open(img)
# im.show()
os.chdir(r'D:\Python\Lib\site-packages')  # required, otherwise the pytesser path cannot be found
text = image_to_string(im)
print text
# In testing: the graduate-school system's captcha is recognized correctly,
# but my library's captcha is not.
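When simple OCR fails (as with the library captcha above), a grayscale-plus-binarization pass with PIL sometimes helps before calling image_to_string. A hedged sketch continuing from the snippet above; the threshold value 140 is an arbitrary guess, not from the original tests, and would need per-site tuning:

# Optional preprocessing before OCR: grayscale + hard threshold.
# The threshold value (140) is a guess and usually needs tuning per site.
im_gray = im.convert('L')                                 # 8-bit grayscale
im_bw = im_gray.point(lambda px: 255 if px > 140 else 0)  # binarize: white above threshold
text = image_to_string(im_bw)
print text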
Manual captcha entry:
# -*- coding: utf-8 -*-
import cookielib
import urllib
import urllib2

# Approach for logging into "My Library": first request the captcha page to obtain
# the captcha image and a cookie, then post to the login page to simulate the login.
captcha_url = 'http://mylib.cau.edu.cn/reader/captcha.php'
login_url = 'http://mylib.cau.edu.cn/reader/redr_verify.php'
cookie = cookielib.MozillaCookieJar(r'D:\PyCharm\cookie.txt')
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
urllib2.install_opener(opener)

# fetch the captcha image (this also sets the session cookie)
req = urllib2.Request(captcha_url)
response = urllib2.urlopen(req).read()
imgfile = 'd:/captcha.jpg'
picture = open(imgfile, 'wb')
picture.write(response)
picture.close()
security_code = raw_input("Please enter the captcha: ")

# build the POST data and headers, then simulate the login (note the cookie)
postdata = {
    "number": "s20153101096",
    "passwd": "*******",
    "captcha": security_code,
    "select": "bar_no",
    'returnUrl': '',
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:49.0) Gecko/20100101 Firefox/49.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
data = urllib.urlencode(postdata)
req2 = urllib2.Request(login_url, data, headers)
try:
    result = urllib2.urlopen(req2).read()
except urllib2.HTTPError, e:
    print e.code
else:
    print result
cookie.save(ignore_discard=True, ignore_expires=True)
(Font-based anti-scraping) the fontTools library
Decoding base64-encoded font data into a font file (see the sketch after the links below): https://blog.csdn.net/huiyinimen/article/details/83444636
Maoyan Movies font anti-scraping: https://zhuanlan.zhihu.com/p/33112359
https://www.jianshu.com/p/0e2e1aa6d270
58.com branded-apartment font anti-scraping: https://blog.csdn.net/m0_37156322/article/details/84658872
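A minimal sketch of the decode step with fontTools, assuming the page embeds the font as a base64 string inside a @font-face rule (font_b64 below is a placeholder you would scrape out of the page source):

import base64
from io import BytesIO
from fontTools.ttLib import TTFont

font_b64 = '...'  # placeholder: base64 font data scraped from the page's @font-face rule
font = TTFont(BytesIO(base64.b64decode(font_b64)))
font.save('site_font.woff')        # keep a copy on disk for inspection in a font viewer
print font.getGlyphOrder()         # glyph names, e.g. ['glyph00000', 'uniE893', ...]
print font['cmap'].getBestCmap()   # codepoint -> glyph-name mapping used to rebuild the text

The remaining site-specific work (mapping obfuscated glyphs back to real digits for Maoyan or 58) is what the linked posts walk through.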
Image hotlink protection (Referer):
The scenario: I want to embed an image from another site in my own page with
<img src="xxxx" />
but the image host has an anti-hotlinking policy: the backend checks whether the request's Referer header comes from a domain other than its own, and returns 403 Forbidden when it does. (A page-side workaround is
<meta name="referrer" content="never" />
which tells the browser not to send the Referer header at all.)
https://juejin.im/entry/5adaa72c6fb9a07aa43bc665
https://bindog.github.io/blog/2014/11/18/http-referer-security-and-anti-anti-hotlink/
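On the scraping side the same check can be satisfied directly: send a Referer that points at the image host itself. A minimal urllib2 sketch; the image URL below is a placeholder for a hotlink-protected image:

import urllib2

img_url = 'http://example.com/images/photo.jpg'  # placeholder: hotlink-protected image
req = urllib2.Request(img_url, headers={
    'Referer': 'http://example.com/',  # pretend the request came from the host's own page
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:49.0) Gecko/20100101 Firefox/49.0',
})
with open('photo.jpg', 'wb') as f:
    f.write(urllib2.urlopen(req).read())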
Bloom filter: a Bloom filter is a data structure for testing whether an element belongs to a set, with fast lookups and a small memory footprint. The price of its cheap inserts and queries is that it is probabilistic: it can only tell you that an element is definitely not in the set, or that it probably is (false positives are possible, false negatives are not). In a crawler it is typically used to deduplicate already-visited URLs.
https://blog.csdn.net/preyta/article/details/72970887
https://stackoverflow.com/questions/311202/modern-high-performance-bloom-filter-in-python
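A minimal pure-Python sketch of the idea for URL deduplication (for production use, prefer a tested library such as those discussed in the links above; the size and hash count below are arbitrary):

import hashlib

class BloomFilter(object):
    def __init__(self, size=2 ** 20, num_hashes=7):
        self.size = size                      # number of bits in the filter
        self.num_hashes = num_hashes          # bit positions set per element
        self.bits = bytearray(size // 8 + 1)  # bit array, packed 8 bits per byte

    def _positions(self, item):
        # Derive num_hashes pseudo-independent bit positions from salted MD5 digests.
        for salt in range(self.num_hashes):
            digest = hashlib.md5(('%d:%s' % (salt, item)).encode('utf-8')).hexdigest()
            yield int(digest, 16) % self.size

    def add(self, item):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item):
        # False means definitely never added; True means probably added
        # (false positives possible, false negatives impossible).
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(item))

bf = BloomFilter()
bf.add('http://example.com/page/1')
print 'http://example.com/page/1' in bf  # True
print 'http://example.com/page/2' in bf  # False (with very high probability)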