公告

日历

r.url：返回本次请求最终访问的URL
r.headers：以字典对象存储服务器响应头，但该字典比较特殊，字典键不区分大小写，若键不存在则返回None
r.status_code：返回连接状态，200正常
r.text：默认以unicode形式返回网页内容，也就是网页源码的字符串
r.content：以字节形式（二进制）返回。字节方式的响应体，会自动解码gzip和deflate压缩
r.json()：将响应体中的json数据解析成字典并将其返回（注意json是方法，需要加括号调用）
r.encoding：获取当前的编码
r.encoding = "utf-8"：指定编码，r.text会按该编码对响应内容进行解码，因此必须写在读取r.text之前

POST请求

HTTP协议规定POST提交的数据必须放在消息主体（entity-body）中，但协议并没有规定数据必须使用什么编码方式，服务端是根据请求头中的Content-Type字段来获知请求中的消息主体是用何种方式进行编码，再对消息主体进行解析。具体的编码方式包括:

最常见post提交数据的方式，以form表单形式提交数据
```
application/x-www-form-urlencoded
```
以json串提交数据
```
application/json
```
一般用于上传文件
```
multipart/form-data
```

示例如下：

1、以form形式发送post请求

requests支持以form表单形式发送post请求，只需要将请求的参数构造成一个字典，然后传给requests.post()的data参数即可

# Form fields to submit. Passing a dict as `data` makes requests send it
# as a form-encoded body.
payload = dict(key1="value1", key2="value2")

# httpbin echoes the request back, so the printed response body shows the
# fields the server received.
r = requests.post(url="http://httpbin.org/post", data=payload)
print(r.text)

'''
"form" : {
	"key1":"value1",
    "key2":"value2"
}
'''

2、以json形式发送post请求

可将一json串传给requests.post()的data参数

import json

# httpbin echoes the request back, which makes it easy to inspect what was sent.
url = "http://httpbin.org/post"
payload = {
    "key1": "value1",
    "key2": "value2",
}

# A string passed via `data=` is sent verbatim and requests does not add a
# Content-Type for it, so the body is declared as JSON explicitly here.
headers = {"Content-Type": "application/json"}

r = requests.post(url, data=json.dumps(payload), headers=headers)
# Note: this prints the Content-Type of the *response*, not of the request.
print(r.headers.get("Content-Type"))

'''
application/json
'''

3、以multipart形式发送post请求

requests也支持multipart形式发送post请求，只需将一文件传给requests.post()的files参数即可，文本文件report.txt的内容只有一行：Hello World!，从请求的响应结果可以看到数据已上传到服务端中。

url = "http://httpbin.org/post"

# Open the file in binary mode so its bytes are uploaded unchanged. The
# `with` block closes the handle once the request has been sent.
with open("report.txt", "rb") as fh:
    # The dict key ('file') is the form field name used in the multipart body.
    files = {
        'file': fh
    }
    r = requests.post(url, files=files)

print(r.text)

'''
{
...
  "files": {
    "file": "Hello World!"
  }, 
  "form": {}, 
  "headers": {
    "Content-Type": "multipart/form-data; boundary=6db46af64e694661985109da21c8fe9b", 

  }, 
  "json": null, 
  "origin": "223.72.217.138", 
  "url": "http://httpbin.org/post"
  ...
}
'''

示例1：下载单张图片并保存到本地磁盘目录

#!/usr/bin/python3
#coding=utf-8

'''
下载单张图片并保存到本地磁盘目录
'''

import os 
import requests

# URL of the image to download
url = 'https://ss0.bdstatic.com/5aV1bjqh_Q23odCf/static/superman/img/logo/bd_logo1_31bdc765.png'
# Directory the image is saved into
path = "D://图片//"

# Local target path: the save directory plus the last segment of the URL,
# e.g. D://图片//bd_logo1_31bdc765.png
down = path + url.split("/")[-1]
try:
    # Create the save directory on first use
    if not os.path.exists(path):
        os.mkdir(path)
    # Only download when the file is not already on disk
    if not os.path.exists(down):
        r = requests.get(url)
        print(r)
        # Turn 4xx/5xx responses into an exception so an error page is
        # never written to disk as if it were the image.
        r.raise_for_status()
        # "wb": the image is written as raw bytes
        with open(down, "wb") as f:
            f.write(r.content)
        print("图片下载成功")
    else:
        print("图片已经存在")
except Exception as e:
    # Top-level boundary of the script: report the failure instead of crashing
    print("爬取失败：", str(e))

示例2：使用bs4和requests模块，爬取图片并以1、2、3的形式保存到指定目录

#!/usr/bin/python3
#coding=utf-8

import os 
import requests
from bs4 import BeautifulSoup

# Directory the images are saved into
path = "D://爬虫专用//"
# Page to scrape
url = "http://tieba.baidu.com/p/1753935195"
html_page = requests.get(url)

# Parse the page source
soup = BeautifulSoup(html_page.text, 'lxml')
# Collect every tag whose class is "BDE_Image" (the <img> tags of interest)
class_image = soup.find_all(attrs={"class": "BDE_Image"})
print(class_image)

# Create the save directory on first use
if not os.path.exists(path):
    os.mkdir(path)
try:
    # Running counter used as the file name: 0.jpg, 1.jpg, 2.jpg, ...
    x = 0
    for i in class_image:
        # Image address taken from the tag's src attribute
        src_url = i.get("src")
        # Download the image itself
        img_list = requests.get(src_url)
        # Local target path for this image
        down = path + "%s.jpg" % x
        print(down)
        # Write the image as raw bytes
        with open(down, "wb") as f:
            f.write(img_list.content)
        x += 1
except Exception as e:
    # Report what went wrong rather than hiding the error
    print("爬取失败：", str(e))

示例3：分页爬取

#!/usr/bin/python3
#coding=utf-8

import os 
import requests
from bs4 import BeautifulSoup

# Site root to crawl
all_url = "http://www.mzitu.com"
# Request headers: headers_w is sent with page requests, headers_i with the
# image downloads. They differ only in the Referer value.
headers_w = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
headers_i = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}
# Fetch the home page; its source is used below to find the page count
start_html = requests.get(all_url, headers=headers_w)
# Root directory under which one sub-directory per album is created
path = "D:\\images\\"

# Find the highest page number from the pagination links
soup = BeautifulSoup(start_html.text, "lxml")
# All <a> tags carrying class="page-numbers", as a list
page = soup.find_all("a", class_="page-numbers")
# The last link is "next", so the one before it holds the last page number
max_page = page[-2].text

same_url = "http://www.mzitu.com/page/"

for i in range(1, int(max_page) + 1):
    # URL of the i-th listing page
    page_url = same_url + str(i)
    get_page_url = requests.get(page_url, headers=headers_w)
    page_soup = BeautifulSoup(get_page_url.text, "lxml")
    # Album links of THIS listing page: <a target="_blank"> inside
    # <div class="postlist">
    get_all_a = page_soup.find("div", class_="postlist").find_all("a", target="_blank")
    for a in get_all_a:
        # Link text is the album title; links without text are skipped
        title = a.get_text()
        if title == "":
            continue
        print("准备爬取：%s" % title)
        # Album directory: title with surrounding whitespace removed and
        # the characters "?" and ":" dropped
        album_dir = path + title.strip().replace("?", "").replace(":", "")
        os.makedirs(album_dir, exist_ok=True)
        # Files below are written with relative names, so switch into the
        # album directory first
        os.chdir(album_dir)
        # Album URL, e.g. http://www.mzitu.com/155568
        href = a.get("href")
        # Fetch the album page to learn how many pictures it has
        html = requests.get(href, headers=headers_w)
        msg = BeautifulSoup(html.text, "lxml")
        # Per the original author's note, the 11th <span> on the album page
        # holds the picture count. This depends on the site's page layout.
        pic_max = msg.find_all("span")[10].text
        # Skip albums that already have at least that many files on disk
        if len(os.listdir(album_dir)) >= int(pic_max):
            print("已保存，跳过")
            continue
        for num in range(1, int(pic_max) + 1):
            # Page of the num-th picture, e.g. http://www.mzitu.com/155568/44
            pic = href + "/" + str(num)
            html = requests.get(pic, headers=headers_w)
            mess = BeautifulSoup(html.text, "lxml")
            # The picture is the <img> whose alt text equals the album title,
            # e.g. <img alt="..." src="http://i.meizitu.net/2018/11/01a02.jpg"/>
            pic_url = mess.find("img", alt=title)
            if pic_url is None:
                # No matching <img> on this page: skip it rather than crash
                print("未找到图片，跳过：%s" % pic)
                continue
            img_src = pic_url.get("src")
            # Download the image file itself
            html_img = requests.get(img_src, headers=headers_i)
            # File name is the last segment of the image URL
            file_name = img_src.split("/")[-1]

            with open(file_name, "wb") as f:
                f.write(html_img.content)
            print("图片%s爬取完成" % file_name)
    print("第%s爬取完成" % str(i))

posted on 2022-03-31 10:33 雷子锅阅读(366) 评论(0) 收藏举报

刷新页面返回顶部

导航

安装

常用请求方法

requests.get()

简单使用

发送无参数的get请求

发送无参数的get请求，设置超时时间，默认以秒为单位

发送带参数的请求

定制请求头

Response对象属性

POST请求

示例1：下载单张图片并保存到本地磁盘目录

示例2：使用bs4和requests模块，爬取图片并以1、2、3的形式保存到指定目录

示例3：分页爬取


博客园 © 2004-2025 浙公网安备 33010602011771号浙ICP备2021040463号-3