爬虫之Python自带的urllib库

一.urllib库

　　urllib是Python自带的一个用于爬虫的库，其主要作用就是可以通过代码模拟浏览器发送请求。其常被用到的子模块在Python3中的为urllib.request和urllib.parse，在Python2中是urllib和urllib2。

'''
# Scrape publisher names from Douban Read and save them to a text file.
import urllib.request
import re
# Close the HTTP response as soon as the body has been read.
with urllib.request.urlopen("https://read.douban.com/provider/all") as resp:
    data=resp.read().decode("utf-8")
pat='<div class="name">(.*?)</div>'
rst=re.compile(pat).findall(data)
# Write with an explicit UTF-8 encoding: the page was decoded as UTF-8 and the
# platform default encoding may be unable to represent every character.
# The with-block also guarantees the file is closed if a write fails.
with open("D:\\chubanshe.txt","w",encoding="utf-8") as fh:
    for name in rst:
        print(name)
        fh.write(name+"\n")
'''
'''


#urllib基础
import urllib.request
#urlretrieve(网址,本地文件存储地址) 直接下载网页到本地
urllib.request.urlretrieve("http://www.baidu.com","D:\\dld.html")
urllib.request.urlcleanup()
#看网页相应的简介信息info()
file=urllib.request.urlopen("https://read.douban.com/provider/all")
print(file.info())
#返回网页爬取的状态码getcode()
print(file.getcode())
#获取当前访问的网页的url，geturl()
print(file.geturl())
'''
'''



# Timeout demo: request the page 100 times with a 1-second timeout and
# report each failure instead of aborting the loop.
import urllib.request
for i in range(0,100):
    try:
        # The with-block closes each response, so the loop does not
        # accumulate up to 100 open connections.
        with urllib.request.urlopen("http://www.baidu.com",timeout=1) as file:
            print(len(file.read().decode("utf-8")))
    except Exception as err:
        print("出现异常"+str(err))

'''
'''



# GET request demo: automated Baidu search for a keyword, result pages 1-10.
import urllib.request
import re
keywd="韦玮"
# Percent-encode the keyword so it can be placed in the URL.
keywd=urllib.request.quote(keywd)
pat="title:'(.*?)',"
pat2='"title":"(.*?)",'
# Compile both title patterns once; they do not change between pages.
title_re1=re.compile(pat)
title_re2=re.compile(pat2)
# The `pn` offset for page number `num` is (num-1)*10.
for i in range(1,11):
    url="http://www.baidu.com/s?wd="+keywd+"&pn="+str((i-1)*10)
    with urllib.request.urlopen(url) as resp:
        data=resp.read().decode("utf-8")
    rst1=title_re1.findall(data)
    rst2=title_re2.findall(data)
    for title in rst1:
        print(title)
    for title in rst2:
        print(title)

'''
'''




# POST request demo: submit a form and save the returned page.
import urllib.request
import urllib.parse
posturl="http://www.iqianyue.com/mypost/"
# urlencode() builds the form body; it must be bytes before it is sent.
postdata=urllib.parse.urlencode({
    "name":"ceo@txk7.com",
    "pass":"kjsahgjkashg",
    }).encode("utf-8")
# Giving Request a data payload makes urlopen() send a POST.
req=urllib.request.Request(posturl,postdata)
with urllib.request.urlopen(req) as resp:
    rst=resp.read().decode("utf-8")
# Explicit UTF-8 so the write does not depend on the platform default
# encoding; the with-block closes the file even if the write fails.
with open("D:\\post.html","w",encoding="utf-8") as fh:
    fh.write(rst)
'''



#异常处理
'''
URLError出现的原因：
1）连不上服务器
2）远程url不存在
3）无网络
4）触发HTTPError
'''
'''
import urllib.request
import urllib.error
try:
    urllib.request.urlopen("http://blog.csdn.net")
except urllib.error.URLError as e:
    # Print the status code first and the reason second, each only when
    # the caught exception actually carries that attribute.
    for attr in ("code","reason"):
        if hasattr(e,attr):
            print(getattr(e,attr))
'''
'''


#浏览器伪装
import urllib.request
url="http://blog.csdn.net"
#头文件格式header=("User-Agent",具体用户代理值)
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
data=opener.open(url).read()
fh=open("D:\\ua.html","wb")
fh.write(data)
fh.close()


#爬取腾讯新闻首页所有新闻内容
'''
1、爬取新闻首页
2、得到各新闻链接
3、爬取新闻链接
4、寻找有没有frame
5、若有，抓取frame下对应网页内容
6、若没有，直接抓取当前页面
'''
import urllib.request
import re
url="http://news.qq.com/"
data=urllib.request.urlopen(url).read().decode("UTF-8","ignore")
pat1='<a target="_blank" class="linkto" href="(.*?)"'
alllink=re.compile(pat1).findall(data)
for i in range(0,len(alllink)):
    thislink=alllink[i]
    thispage=urllib.request.urlopen(thislink).read().decode("gb2312","ignore")
    pat2="<frame src=(.*?)>"
    isframe=re.compile(pat2).findall(thispage)
    if(len(isframe)==0):
        #直接爬
        print(i)
        urllib.request.urlretrieve(thislink,"D:\\data\\"+str(i)+".html")
    else:
        #得到frame的网址爬
        flink=isframe[0]
        urllib.request.urlretrieve(flink,"D:\\data\\"+str(i)+".html")




'''
#CSDN博文爬虫
import urllib.request
import re
url="http://blog.csdn.net/"
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
#安装为全局
urllib.request.install_opener(opener)
data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
pat='<h3  class="tracking-ad" data-mod="popu_254"><a href="(.*?)"'
alllink=re.compile(pat).findall(data)
#print(alllink)
for i in range(0,len(alllink)):
    localpath="D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\"+str(i)+".html"
    thislink=alllink[i]
    urllib.request.urlretrieve(thislink,filename=localpath)
    print("当前文章(第"+str(i)+"篇）爬取成功！")
    
'''




'''
#糗事百科段子爬虫
import urllib.request
import re
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
#安装为全局
urllib.request.install_opener(opener)
for i in range(0,35):
    thisurl="http://www.qiushibaike.com/8hr/page/"+str(i+1)+"/?s=4948859"
    data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
    pat='<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst=re.compile(pat,re.S).findall(data)
    for j in range(0,len(rst)):
        print(rst[j])
        print("-------")

'''




'''
#用户代理池的构建
import urllib.request
import re
import random
uapools=[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    ]

def ua(uapools):
    """Install a global opener that sends a randomly chosen User-Agent.

    uapools: non-empty sequence of User-Agent strings to choose from.
    The chosen value is printed. Returns None.
    """
    chosen=random.choice(uapools)
    print(chosen)
    ua_opener=urllib.request.build_opener()
    # Replace the opener's header list with the single User-Agent header.
    ua_opener.addheaders=[("User-Agent",chosen)]
    # Install globally so later urllib.request.urlopen() calls use it.
    urllib.request.install_opener(ua_opener)

for i in range(0,35):
    ua(uapools)
    thisurl="http://www.qiushibaike.com/8hr/page/"+str(i+1)+"/?s=4948859"
    data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
    pat='<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst=re.compile(pat,re.S).findall(data)
    for j in range(0,len(rst)):
        print(rst[j])
        print("-------")
'''



'''
#IP代理的构建实战
import urllib.request
ip="68.13.196.233:8080"
proxy=urllib.request.ProxyHandler({"http":ip})
opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
url="http://www.baidu.com"
data1=urllib.request.urlopen(url).read()
data=data1.decode("utf-8","ignore")
print(len(data))
fh=open("D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\ip_baidu.html","wb")
fh.write(data1)
fh.close()
'''

'''
#IP代理池构建的第一种方案(适合于代理IP稳定的情况)
import random
import urllib.request
ippools=[
    "68.13.196.233:8080",
    "112.247.100.200:9999",
    "112.247.5.22:9999",
    ]
def ip(ippools):
    """Install a global opener that routes HTTP through a random proxy.

    ippools: non-empty sequence of "host:port" proxy strings.
    The chosen proxy is printed. Returns None.
    """
    chosen=random.choice(ippools)
    print(chosen)
    proxy_map={"http":chosen}
    handler=urllib.request.ProxyHandler(proxy_map)
    # Install globally so later urllib.request.urlopen() calls use the proxy.
    urllib.request.install_opener(
        urllib.request.build_opener(handler,urllib.request.HTTPHandler)
    )

for i in range(0,5):
    try:
        ip(ippools)
        url="http://www.baidu.com"
        data1=urllib.request.urlopen(url).read()
        data=data1.decode("utf-8","ignore")
        print(len(data))
        fh=open("D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\ip_baidu_"+str(i)+".html","wb")
        fh.write(data1)
        fh.close()
    except Exception as err:
        print(err)
'''
'''
#IP代理池实现的第二种方式（接口调用法，这种方法更适合于代理IP不稳定的情况）
import urllib.request
def api(url="http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=10&foreign=only"):
    """Fetch a fresh proxy pool from the proxy-provider endpoint.

    The response body is read line by line; each line is decoded as UTF-8
    (undecodable bytes ignored) and stripped of surrounding whitespace, so
    entries no longer carry the trailing line break. Blank lines are skipped.

    url: endpoint to query; defaults to the provider URL used so far.
    Returns a list of str, one entry per non-blank response line.
    """
    print("这一次调用了接口")
    # The with-block closes the response once all lines have been read.
    with urllib.request.urlopen(url) as thisall:
        lines=[item.decode("utf-8","ignore").strip() for item in thisall]
    return [line for line in lines if line]
def ip(ippools,time):
    """Install a global opener that routes HTTP through ippools[time].

    ippools: sequence of "host:port" proxy strings.
    time: index of the entry to use.
    The selected proxy is printed. Returns None.
    """
    selected=ippools[time]
    print("当前用的IP是："+selected)
    handler=urllib.request.ProxyHandler({"http":selected})
    proxy_opener=urllib.request.build_opener(handler,urllib.request.HTTPHandler)
    # Install globally so later urllib.request.urlopen() calls use the proxy.
    urllib.request.install_opener(proxy_opener)
    
# Fetch the page 35 times while rotating proxies: attempt x uses pool entry
# x % 10, and a fresh pool is requested from the API on every 10th attempt.
x=0
for i in range(0,35):
    try:
        time=x%10
        if time==0:
            ippools=api()
        ip(ippools,time)
        url="http://www.baidu.com"
        with urllib.request.urlopen(url) as resp:
            data1=resp.read()
        data=data1.decode("utf-8","ignore")
        print(len(data))
        # The with-block closes the file even when the write fails.
        with open("D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\ip_baidu_"+str(i)+".html","wb") as fh:
            fh.write(data1)
    except Exception as err:
        # Best effort: report the failure and move on to the next proxy.
        print(err)
    finally:
        # Advance to the next pool entry whether the attempt worked or not.
        x+=1
'''


#淘宝商品图片爬虫
import urllib.request
import re
import random
keyname="维维豆奶"
key=urllib.request.quote(keyname)
uapools=[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    ]

def ua(uapools):
    """Pick a random User-Agent and make it the default for urlopen().

    uapools: non-empty sequence of User-Agent strings.
    Prints the chosen value and returns None.
    """
    picked=random.choice(uapools)
    print(picked)
    header=("User-Agent",picked)
    new_opener=urllib.request.build_opener()
    new_opener.addheaders=[header]
    # Install as the global opener.
    urllib.request.install_opener(new_opener)
# Image URLs appear in the page as "pic_url":"//host/path"; compile once.
pat='"pic_url":"//(.*?)"'
img_re=re.compile(pat)
for i in range(1,101):
    # The `s` query parameter is the result offset; it advances by 44 per page.
    url="https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
    # Switch to a random User-Agent before every page request.
    ua(uapools)
    with urllib.request.urlopen(url) as resp:
        data=resp.read().decode("utf-8","ignore")
    imglist=img_re.findall(data)
    for j,thisimg in enumerate(imglist):
        thisimgurl="http://"+thisimg
        # Separate page and index with "_": plain str(i)+str(j) maps both
        # (page 1, image 11) and (page 11, image 1) to "111.jpg", so later
        # downloads silently overwrote earlier ones.
        localfile="D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\taobao\\dounai\\"+str(i)+"_"+str(j)+".jpg"
        urllib.request.urlretrieve(thisimgurl,filename=localfile)
        




#如何同时使用用户代理池和IP代理池
def ua_ip(myurl,attempts=35,api_url="http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=10&foreign=only&filter=on"):
    """Fetch `myurl` using a rotating proxy plus a random User-Agent.

    Attempt number x uses proxy-pool entry x % 10; a fresh pool is requested
    from `api_url` on every 10th attempt. Each attempt installs a new global
    urllib opener. A failed attempt is printed and the next one is tried.

    myurl: URL to download.
    attempts: maximum number of attempts (default 35, the previous fixed value).
    api_url: proxy-provider endpoint returning one "host:port" per line.

    Returns the page decoded as UTF-8 (undecodable bytes ignored) from the
    first successful attempt, or None when every attempt failed. Previously
    the all-failed case raised UnboundLocalError at the return statement.
    """
    import random
    import urllib.request
    uapools=[
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        ]
    def api():
        # Download the proxy list, one entry per non-blank line, with the
        # trailing line break stripped from each entry.
        print("这一次调用了接口")
        with urllib.request.urlopen(api_url) as thisall:
            lines=[item.decode("utf-8","ignore").strip() for item in thisall]
        return [line for line in lines if line]
    def ip(ippools,time,uapools):
        # Install a global opener using proxy ippools[time] and a random UA.
        thisua=random.choice(uapools)
        print(thisua)
        headers=("User-Agent",thisua)
        thisip=ippools[time]
        print("当前用的IP是："+ippools[time])
        proxy=urllib.request.ProxyHandler({"http":thisip})
        opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
        opener.addheaders=[headers]
        urllib.request.install_opener(opener)

    data=None      # stays None if no attempt succeeds
    ippools=[]     # defined up front so a failed first api() call is harmless
    x=0
    for _ in range(0,attempts):
        try:
            time=x%10
            if time==0:
                ippools=api()
            ip(ippools,time,uapools)
            with urllib.request.urlopen(myurl) as resp:
                data1=resp.read()
            data=data1.decode("utf-8","ignore")
            print(len(data))
            break
        except Exception as err:
            # Best effort: report the failure and try the next proxy.
            print(err)
        finally:
            x+=1
    return data
#data=ua_ip("http://www.baidu.com")

urllib基础

urlopen函数：

在Python3的urllib库中，所有和网络请求相关的方法，都被集中到urllib.request模块下面了。先来看下urlopen函数的基本使用：

from urllib import request
resp = request.urlopen('http://www.baidu.com')
print(resp.read())

实际上，使用浏览器访问百度，右键查看源代码。你会发现，跟我们刚才打印出来的数据是一模一样的。也就是说，上面的三行代码就已经帮我们把百度的首页的全部代码爬下来了。一个基本的url请求对应的python代码真的非常简单。
以下对urlopen函数进行详细讲解：

url：请求的url。
data：请求的data，如果设置了这个值，那么将变成post请求。
返回值：返回值是一个http.client.HTTPResponse对象，这个对象是一个类文件句柄对象。有read(size)、readline、readlines以及getcode等方法。

urlretrieve函数：

这个函数可以方便的将网页上的一个文件保存到本地。以下代码可以非常方便的将百度的首页下载到本地：

from urllib import request
request.urlretrieve('http://www.baidu.com/','baidu.html')

urlencode函数：

用浏览器发送请求的时候，如果url中包含了中文或者其他特殊字符，那么浏览器会自动的给我们进行编码。而如果使用代码发送请求，那么就必须手动的进行编码，这时候就应该使用urlencode函数来实现。urlencode可以把字典数据转换为URL编码的数据。示例代码如下：

from urllib import parse
data = {'name':'爬虫基础','greet':'hello world','age':100}
qs = parse.urlencode(data)
print(qs)

parse_qs函数：

可以将经过编码后的url参数进行解码。示例代码如下：

from urllib import parse
qs = "name=%E7%88%AC%E8%99%AB%E5%9F%BA%E7%A1%80&greet=hello+world&age=100"
print(parse.parse_qs(qs))

urlparse和urlsplit：

有时候拿到一个url，想要对这个url中的各个组成部分进行分割，那么这时候就可以使用urlparse或者是urlsplit来进行分割。示例代码如下：

from urllib import request,parse

url = 'http://www.baidu.com/s?username=zhiliao'

# Split the URL into its components; parse.urlparse(url) can be used the
# same way here.
result = parse.urlsplit(url)

# Print the four components of interest as "name: value" lines.
for part in ('scheme', 'netloc', 'path', 'query'):
    print(part + ':', getattr(result, part))

urlparse和urlsplit基本上是一模一样的。唯一不一样的地方是，urlparse里面多了一个params属性，而urlsplit没有这个params属性。比如有一个url为：url = 'http://www.baidu.com/s;hello?wd=python&username=abc#1'，
那么urlparse可以获取到hello，而urlsplit不可以获取到。url中的params也用得比较少。

urllib.parse.urljoin

from urllib import parse

url = 'http://www.baidu.com/'
img_url = '/123.jpg'
new_url = parse.urljoin(url, img_url)
print(new_url)  # http://www.baidu.com/123.jpg

urllib.parse.quote

from urllib import parse

wd = '快代理'
quote_wd = parse.quote(wd)
print(quote_wd)  # %E5%BF%AB%E4%BB%A3%E7%90%86

#解回来
base_wd = parse.unquote(quote_wd)
print(base_wd)  # 快代理

request.Request类：

如果想要在请求的时候增加一些请求头，那么就必须使用request.Request类来实现。比如要增加一个User-Agent，示例代码如下：

from urllib import request

headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
req = request.Request("http://www.baidu.com/",headers=headers)
resp = request.urlopen(req)
print(resp.read())

ProxyHandler处理器（代理设置）

很多网站会检测某一段时间某个IP的访问次数(通过流量统计，系统日志等)，如果访问次数多的不像正常人，它会禁止这个IP的访问。
所以我们可以设置一些代理服务器，每隔一段时间换一个代理，就算IP被禁止，依然可以换个IP继续爬取。
urllib中通过ProxyHandler来设置使用代理服务器，下面代码说明如何使用自定义opener来使用代理：

from urllib import request

# 这个是没有使用代理的
# resp = request.urlopen('http://httpbin.org/get')
# print(resp.read().decode("utf-8"))

# 这个是使用了代理的
handler = request.ProxyHandler({"http":"218.66.161.88:31769"})

opener = request.build_opener(handler)
req = request.Request("http://httpbin.org/ip")
resp = opener.open(req)
print(resp.read())

常用的代理有：

快代理：http://www.kuaidaili.com/
代理云：http://www.dailiyun.com/

什么是cookie：

在网站中，http请求是无状态的。也就是说即使第一次和服务器连接后并且登录成功后，第二次请求服务器依然不能知道当前请求是哪个用户。cookie的出现就是为了解决这个问题，第一次登录后服务器返回一些数据（cookie）给浏览器，然后浏览器保存在本地，当该用户发送第二次请求的时候，就会自动的把上次请求存储的cookie数据自动的携带给服务器，服务器通过浏览器携带的数据就能判断当前用户是哪个了。cookie存储的数据量有限，不同的浏览器有不同的存储大小，但一般不超过4KB。因此使用cookie只能存储一些小量的数据。

cookie的格式：

Set-Cookie: NAME=VALUE; Expires/Max-Age=DATE; Path=PATH; Domain=DOMAIN_NAME; Secure

参数意义：

NAME：cookie的名字。
VALUE：cookie的值。
Expires：cookie的过期时间。
Path：cookie作用的路径。
Domain：cookie作用的域名。
SECURE：是否只在https协议下起作用。

使用http.cookiejar库（Python2中名为cookielib）和HTTPCookieProcessor模拟登录：

Cookie 是指网站服务器为了辨别用户身份和进行Session跟踪，而储存在用户浏览器上的文本文件，Cookie可以保持登录信息到用户下次与服务器的会话。
这里以人人网为例。人人网中，要访问某个人的主页，必须先登录才能访问，登录说白了就是要有cookie信息。那么如果我们想要用代码的方式访问，就必须要有正确的cookie信息才能访问。解决方案有两种，第一种是使用浏览器访问，然后将cookie信息复制下来，放到headers中。示例代码如下：

from urllib import request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    'Cookie': 'anonymid=jacdwz2x-8bjldx; depovince=GW; _r01_=1; _ga=GA1.2.1455063316.1511436360; _gid=GA1.2.862627163.1511436360; wp=1; JSESSIONID=abczwY8ecd4xz8RJcyP-v; jebecookies=d4497791-9d41-4269-9e2b-3858d4989785|||||; ick_login=884e75d4-f361-4cff-94bb-81fe6c42b220; _de=EA5778F44555C091303554EBBEB4676C696BF75400CE19CC; p=61a3c7d0d4b2d1e991095353f83fa2141; first_login_flag=1; ln_uact=970138074@qq.com; ln_hurl=http://hdn.xnimg.cn/photos/hdn121/20170428/1700/main_nhiB_aebd0000854a1986.jpg; t=3dd84a3117737e819dd2c32f1cdb91d01; societyguester=3dd84a3117737e819dd2c32f1cdb91d01; id=443362311; xnsid=169efdc0; loginfrom=syshome; ch_id=10016; jebe_key=9c062f5a-4335-4a91-bf7a-970f8b86a64e%7Ca022c303305d1b2ab6b5089643e4b5de%7C1511449232839%7C1; wp_fold=0'
}

url = 'http://www.renren.com/880151247/profile'

req = request.Request(url,headers=headers)
# Close the response as soon as the body has been read.
with request.urlopen(req) as resp:
    page = resp.read().decode('utf-8')
# The text was decoded as UTF-8, so write it back as UTF-8 explicitly;
# the platform default encoding may be unable to represent every character.
with open('renren.html','w',encoding='utf-8') as fp:
    fp.write(page)

但是每次在访问需要cookie的页面都要从浏览器中复制cookie比较麻烦。在Python处理Cookie，一般是通过http.cookiejar模块和urllib模块的HTTPCookieProcessor处理器类一起使用。http.cookiejar模块主要作用是提供用于存储cookie的对象。而HTTPCookieProcessor处理器主要作用是处理这些cookie对象，并构建handler对象。

http.cookiejar模块：

该模块主要的类有CookieJar、FileCookieJar、MozillaCookieJar、LWPCookieJar。这四个类的作用分别如下：

CookieJar：管理HTTP cookie值、存储HTTP请求生成的cookie、向传出的HTTP请求添加cookie的对象。整个cookie都存储在内存中，对CookieJar实例进行垃圾回收后cookie也将丢失。
FileCookieJar (filename,delayload=None,policy=None)：从CookieJar派生而来，用来创建FileCookieJar实例，检索cookie信息并将cookie存储到文件中。filename是存储cookie的文件名。delayload为True时支持延迟访问文件，即只有在需要时才读取文件或在文件中存储数据。
MozillaCookieJar (filename,delayload=None,policy=None)：从FileCookieJar派生而来，创建与Mozilla浏览器 cookies.txt兼容的FileCookieJar实例。
LWPCookieJar (filename,delayload=None,policy=None)：从FileCookieJar派生而来，创建与libwww-perl标准的 Set-Cookie3 文件格式兼容的FileCookieJar实例。

登录人人网：

利用http.cookiejar和request.HTTPCookieProcessor登录人人网。相关示例代码如下：

from urllib import request,parse
from http.cookiejar import CookieJar

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}

def get_opener():
    """Build an opener whose requests share one new, empty CookieJar.

    Returns the opener produced by request.build_opener() with an
    HTTPCookieProcessor wrapping that jar.
    """
    return request.build_opener(request.HTTPCookieProcessor(CookieJar()))

def login_renren(opener, email="970138074@qq.com", password="pythonspider"):
    """POST the login form to renren.com through `opener`.

    opener: opener used to send the request; any cookies set by the login
        response are handled by whatever cookie processor it carries.
    email, password: credentials to submit. The defaults are the values this
        demo previously hard-coded.
        NOTE(review): real credentials should not live in source code.

    Returns None. The response body is not needed, so it is closed at once.
    """
    form = parse.urlencode({"email": email, "password": password}).encode('utf-8')
    login_url = "http://www.renren.com/PLogin.do"
    req = request.Request(login_url, headers=headers, data=form)
    # Close the response instead of leaving the connection open.
    with opener.open(req):
        pass

def visit_profile(opener):
    """Download the profile page through `opener` and save it as renren.html.

    opener: opener used to send the request (expected to carry the login
        cookies already).
    Returns None.
    """
    url = 'http://www.renren.com/880151247/profile'
    req = request.Request(url,headers=headers)
    # Close the response as soon as the body has been read.
    with opener.open(req) as resp:
        page = resp.read().decode("utf-8")
    # The text was decoded as UTF-8, so write it back as UTF-8 explicitly;
    # the platform default encoding may be unable to represent every character.
    with open('renren.html','w',encoding='utf-8') as fp:
        fp.write(page)

if __name__ == '__main__':
    opener = get_opener()
    login_renren(opener)
    visit_profile(opener)

View Code

保存cookie到本地：

保存cookie到本地，可以使用cookiejar的save方法，并且需要指定一个文件名：

from urllib import request
from http.cookiejar import MozillaCookieJar

cookiejar = MozillaCookieJar("cookie.txt")
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
req = request.Request('http://httpbin.org/cookies',headers=headers)

resp = opener.open(req)
print(resp.read())
cookiejar.save(ignore_discard=True,ignore_expires=True)

View Code

从本地加载cookie：

从本地加载cookie，需要使用cookiejar的load方法，并且也需要指定文件名：

from urllib import request
from http.cookiejar import MozillaCookieJar

cookiejar = MozillaCookieJar("cookie.txt")
cookiejar.load(ignore_expires=True,ignore_discard=True)
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
req = request.Request('http://httpbin.org/cookies',headers=headers)

resp = opener.open(req)
print(resp.read())

View Code

二.由易到难的爬虫程序：

　　1.爬取百度首页面所有数据值

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
#导包
import urllib.request
import urllib.parse
if __name__ == "__main__":
    #指定爬取的网页url
    url = 'http://www.baidu.com/'
    #通过urlopen函数向指定的url发起请求，返回响应对象
    reponse = urllib.request.urlopen(url=url)
    #通过调用响应对象中的read函数，返回响应回客户端的数据值（爬取到的数据）
    data = reponse.read()#返回的数据为byte类型，并非字符串
    print(data)#打印显示爬取到的数据值。

View Code

#补充说明
urlopen函数原型：urllib.request.urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *, cafile=None, capath=None, cadefault=False, context=None)

在上述案例中我们只使用了该函数中的第一个参数url。在日常开发中，我们最常用的是url和data这两个参数。

url参数：指定向哪个url发起请求
data参数：post请求中携带的参数。需要先把参数封装成字典，用urllib.parse.urlencode编码后再encode成bytes类型，才能传递给该参数（暂时不需要理解，后期会讲）

urlopen函数返回的响应对象，相关函数调用介绍：
response.headers：获取响应头信息（headers是属性而不是方法，也可以调用response.info()获取）
response.getcode()：获取响应状态码
response.geturl()：获取请求的url
response.read()：获取响应中的数据值（字节类型）

补充说明

　　2.将爬取到百度新闻首页的数据值写入文件进行存储

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
import urllib.request
import urllib.parse
if __name__ == "__main__":
    url = 'http://news.baidu.com/'
    # Close the response as soon as the body has been read.
    with urllib.request.urlopen(url=url) as response:
        # decode() turns the response bytes into a str (UTF-8 by default).
        data = response.read().decode()
    # Write the text to news.html with an explicit UTF-8 encoding, matching
    # the decoding above; the platform default encoding may be unable to
    # represent every character.
    with open('./news.html','w',encoding='utf-8') as fp:
        fp.write(data)
    print('写入文件完毕')

View Code

　　3.爬取网络上某张图片数据，且存储到本地

# `import urllib` alone does not load the `request` submodule, so
# `urllib.request` raised AttributeError unless something else had already
# imported it. Import the submodule explicitly.
import urllib.request
url = 'https://www.baidu.com/img/bd_logo1.png'
# NOTE(review): the URL ends in .png while the local file name ends in .jpg;
# the bytes are saved unchanged, so the extension may not match the content.
urllib.request.urlretrieve(url=url,filename='./xxx.jpg')

View Code

　　4.url的特性：url必须为ASCII编码的数据值。所以我们在爬虫代码中编写url时，如果url中存在非ASCII编码的数据值，则必须对其进行ASCII编码后，该url方可被使用。

　　案例：爬取使用百度根据指定词条搜索到的页面数据（例如爬取词条为‘周杰伦’的页面数据）

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
import urllib.request
import urllib.parse

if __name__ == "__main__":
    #原始url中存在非ASCII编码的值，则该url无法被使用。
    #url = 'http://www.baidu.com/s?ie=utf-8&kw=周杰伦'
    #处理url中存在的非ASCII数据值
    url = 'http://www.baidu.com/s?'
    #将带有非ASCII的数据封装到字典中，url中非ASCII的数据往往都是'?'后面键值形式的请求参数
    param = {
        'ie':'utf-8',
        'wd':'周杰伦'
    }
    #使用parse子模块中的urlencode函数将封装好的字典中存在的非ASCII的数值进行ASCII编码
    param = urllib.parse.urlencode(param)
    #将编码后的数据和url进行整合拼接成一个完整可用的url
    url = url + param
    print(url)
    response = urllib.request.urlopen(url=url)
    data = response.read()
    with open('./周杰伦.html','wb') as fp:
        fp.write(data)
    print('写入文件完毕')

View Code

　　5.通过自定义请求对象，用于伪装爬虫程序请求的身份。

　　　　之前在讲解http常用请求头信息时，我们讲解过User-Agent参数，简称为UA，该参数的作用是用于表明本次请求载体的身份标识。如果我们通过浏览器发起的请求，则该请求的载体为当前浏览器，则UA参数的值表明的是当前浏览器的身份标识表示的一串数据。如果我们使用爬虫程序发起的一个请求，则该请求的载体为爬虫程序，那么该请求的UA为爬虫程序的身份标识表示的一串数据。有些网站会通过辨别请求的UA来判别该请求的载体是否为爬虫程序，如果为爬虫程序，则不会给该请求返回响应，那么我们的爬虫程序则也无法通过请求爬取到该网站中的数据值，这也是反爬虫的一种初级技术手段。那么为了防止该问题的出现，则我们可以给爬虫程序的UA进行伪装，伪装成某款浏览器的身份标识。

　　　　上述案例中，我们是通过request模块中的urlopen发起的请求，该请求对象为urllib中内置的默认请求对象，我们无法对其进行UA进行更改操作。urllib还为我们提供了一种自定义请求对象的方式，我们可以通过自定义请求对象的方式，给该请求对象中的UA进行伪装（更改）操作。

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
import urllib.request
import urllib.parse

import ssl
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, for the whole process. Both names are private
# (underscore-prefixed) members of the ssl module. Presumably intended only
# to get past certificate errors in this demo; confirm before reusing it.
ssl._create_default_https_context = ssl._create_unverified_context

if __name__ == "__main__":
    #原始url中存在非ASCII编码的值，则该url无法被使用。
    #url = 'http://www.baidu.com/s?ie=utf-8&kw=周杰伦'
    #处理url中存在的非ASCII数据值
    url = 'http://www.baidu.com/s?'
    #将带有非ASCII的数据封装到字典中，url中非ASCII的数据往往都是'?'后面键值形式的请求参数
    param = {
        'ie':'utf-8',
        'wd':'周杰伦'
    }
    #使用parse子模块中的urlencode函数将封装好的字典中存在的非ASCII的数值进行ASCII编码
    param = urllib.parse.urlencode(param)
    #将编码后的数据和url进行整合拼接成一个完整可用的url
    url = url + param
    #将浏览器的UA数据获取，封装到一个字典中。该UA值可以通过抓包工具或者浏览器自带的开发者工具中获取某请求，从中获取UA的值
    headers={
        'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    #自定义一个请求对象
    #参数：url为请求的url。headers为UA的值。data为post请求的请求参数（后面讲）
    request = urllib.request.Request(url=url,headers=headers)

    #发送我们自定义的请求（该请求的UA已经进行了伪装）
    response = urllib.request.urlopen(request)

    data=response.read()

    with open('./周杰伦.html','wb') as fp:
        fp.write(data)
    print('写入数据完毕')

View Code

posted on 2020-01-14 22:26 始终不够啊阅读(341) 评论(0) 编辑收藏举报

刷新页面返回顶部

爬虫之Python自带的urllib库

urlopen函数：

urlretrieve函数：

urlencode函数：

parse_qs函数：

urlparse和urlsplit：

urllib.parse.urljoin

request.Request类：

ProxyHandler处理器（代理设置）

什么是cookie：

cookie的格式：

使用cookielib库和HTTPCookieProcessor模拟登录：

http.cookiejar模块：

登录人人网：

保存cookie到本地：

从本地加载cookie：

公告