Requests Module

1 Requests Module

1.1 Introduction

Requests is a ready-made Python module for making network requests; it is used to simulate a browser sending requests.

Requests supports HTTP keep-alive and connection pooling, cookie-based session persistence, file uploads, automatic detection of the response content's encoding, and internationalized URLs with automatic encoding of POST data. Under the hood, requests is implemented on top of urllib3.

Official site: Requests: HTTP for Humans™ — Requests 2.28.1 documentation

1.2 Installation

pip install requests
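
Once installed, a request is just a call such as requests.get() plus reading attributes of the returned Response object. A minimal sketch of the most common attributes (httpbin.org is used here only as an assumed public test endpoint; any reachable URL behaves the same way):

import requests

# Send a GET request with query-string parameters
response = requests.get('https://httpbin.org/get', params={'q': 'python'})

print(response.status_code)             # HTTP status code, e.g. 200
print(response.encoding)                # text encoding requests inferred (may be None for JSON responses)
print(response.headers['Content-Type'])
print(response.text[:200])              # response body as str
print(response.json()['args'])          # parse a JSON body into a dict
# response.content holds the raw bytes (useful for images and other binary data)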

2 Handling Text Data with Requests

2.1 Fetching the page source of Sogou search results

import requests

# Target URL
url = 'https://www.sogou.com/web'
kw = input("请输入要查询的内容:")
params = {
    "query": kw,
}

# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
}
# Send the request
response = requests.get(url=url, params=params, headers=headers)

# Set the response encoding
response.encoding = "utf-8"
result = response.text

# Persist the result
with open(kw + ".html", 'w', encoding="utf-8") as f:
    f.write(result)

2.2 Fetching Douban movie data

import requests

# Target URL
url = 'https://movie.douban.com/j/chart/top_list'
start_kw = input("请输入要从第几部电影开始获取:")
limit_kw = input("请输入要获取多少电影数据:")

params = {
    "type": "13",
    "interval_id": "100:90",
    "action": "",
    "start": start_kw,
    "limit": limit_kw,
}

# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
# Send the request
response = requests.get(url=url, params=params, headers=headers)

# Set the response encoding
response.encoding = "utf-8"

# Persist the result
with open("douban.txt", 'w', encoding="utf-8") as f:
    for movie in response.json():
        f.write(movie["title"]+" : " + movie["score"] + "\n")

2.3 Querying the number and addresses of KFC stores for a given location

KFC restaurant information query (kfc.com.cn)

import requests
import math


def get_kfc_addr(kw, page="1"):
    # Target URL
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    data = {
        "cname": "",
        "pid": "",
        "keyword": kw,
        "pageIndex": page,
        "pageSize": "10",
    }

    # Spoof the User-Agent to pass UA checks
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/76.0.3809.132 Safari/537.36",
    }
    # Send the request
    response = requests.post(url=url, data=data, headers=headers)

    # Set the response encoding
    response.encoding = "utf-8"
    return response.json()


kw = input("请输入要查询的地址:")
# Get the total number of stores in this area
nums = get_kfc_addr(kw)["Table"][0]["rowcount"]
# Compute the number of result pages (10 stores per page)
pages = math.ceil(int(nums) / 10)
# Persist the result
with open(kw + ".txt", 'w', encoding="utf-8") as f:
    f.write(f"共计拥有{nums}门店\n")
    for i in range(pages):
        session = get_kfc_addr(kw, str(i + 1))
        for store in session['Table1']:
            f.write(store["storeName"] + " : " + store["addressDetail"] + "\n")

3 Handling Image Data with Requests

3.1 Saving image data with the Requests module

import requests


# Target URL
url = 'https://xiaohua-fd.zol-img.com.cn/t_s600x5000/g2/M00/06/07/ChMlWlyq_pKIfD5WABjHxz2KxPoAAJTtgHk_qkAGMff975.gif'

# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
# Send the request and take the raw bytes of the response
img_data = requests.get(url=url, headers=headers).content

# Persist the result
with open("img.gif", 'wb') as f:
    f.write(img_data)

3.2 Saving image data with urllib.request

from urllib import request

# Target URL
url = 'https://xiaohua-fd.zol-img.com.cn/t_s600x5000/g2/M00/06/07/ChMlWlyq_pKIfD5WABjHxz2KxPoAAJTtgHk_qkAGMff975.gif'

request.urlretrieve(url, filename="haha.gif")
  • Simple, but urlretrieve by itself cannot spoof the User-Agent (see the sketch below for one possible workaround).
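
If a User-Agent is still needed, one workaround (a sketch, not part of the original example) is to install a global opener that carries the header; urlretrieve() uses that opener internally. The file name below is only illustrative.

from urllib import request

url = 'https://xiaohua-fd.zol-img.com.cn/t_s600x5000/g2/M00/06/07/ChMlWlyq_pKIfD5WABjHxz2KxPoAAJTtgHk_qkAGMff975.gif'

# Install a global opener whose requests carry a browser-like User-Agent
opener = request.build_opener()
opener.addheaders = [('User-Agent',
                      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/76.0.3809.132 Safari/537.36')]
request.install_opener(opener)

# urlretrieve() now sends the header above with its request
request.urlretrieve(url, filename="haha_with_ua.gif")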

4 Data Parsing

Ways to implement data parsing:

  • regular expressions
  • bs4
  • xpath
  • pyquery

4.1 Regular expressions

import requests
from urllib import request
import re
import os
import time

dirname = "./imglibs"
if not os.path.exists(dirname):
    os.mkdir(dirname)
# Target URL
url = 'https://xiaohua.zol.com.cn/qutu/%d.html'

# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
for page in range(1, 3):
    new_url = url % page
    page_text = requests.get(url=new_url, headers=headers).text
    ex = '<div class="summary-text">.*?<img .*?src="(.*?)" alt.*?</div>'
    img_lists = re.findall(ex, page_text, re.S)
    time.sleep(3)
    for img in img_lists:
        img_name = img.split('/')[-1]
        img_path = dirname + "/" + img_name
        request.urlretrieve(img, filename=img_path)
        time.sleep(1)
        print(img_name, "下载成功!!!")
  • re.S extends the reach of '.' so that it matches any character, including '\n'; see the short example below.
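
A quick illustration of the difference (the HTML snippet is made up for demonstration):

import re

html = '<div class="summary-text">\n  <img src="a.gif" alt="">\n</div>'
# Without re.S, '.' does not match '\n', so the pattern cannot span lines
print(re.findall(r'<div class="summary-text">.*?src="(.*?)"', html))        # []
# With re.S, '.' also matches '\n', so the match succeeds
print(re.findall(r'<div class="summary-text">.*?src="(.*?)"', html, re.S))  # ['a.gif']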

4.2 bs4

  • How it works:
    • Instantiate a BeautifulSoup object and load the page source to be parsed into it.
    • Call the BeautifulSoup object's methods and attributes to locate tags and extract data.
  • Environment setup:
    • pip install bs4
  • Instantiating BeautifulSoup:
    • BeautifulSoup(fp, 'lxml'): loads the data of a locally stored HTML document into the BeautifulSoup object
    • BeautifulSoup(page_text, 'lxml'): loads page source fetched from the internet into the BeautifulSoup object
  • find_all() and select() return lists

4.2.1 How to use bs4

from bs4 import BeautifulSoup

f = open("sanguo.html", "r", encoding='utf-8')
# f is the page source (a file object here); the second argument 'lxml' selects the lxml parser
soup = BeautifulSoup(f, 'lxml')
soup.div
soup.find('div',id='main_left')
soup.find('span',class_="nav-arrow")
soup.find_all('img',id='logo')
# id selector
soup.select('#logo')
# class selector
soup.select('.bookmark-list')
# class selector with an exact child relationship per level
soup.select('.bookmark-list > div > p')
# class selector spanning multiple levels (any descendant)
soup.select('.bookmark-list p')

# Extracting text
div_tag = soup.select('#top_left_menu')[0]
# Text directly contained in the current tag
div_tag.string
# All text inside the current tag, returned as a str
div_tag.text

# Extracting attributes
img_tag = soup.find('img',class_='book-img')
img_tag['src']
img_tag = soup.select('.book-img')[0]
img_tag['src']

4.2.2 Scraping the full text of Romance of the Three Kingdoms with bs4

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}

sg_url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
sg_response = requests.get(url=sg_url,headers=headers)
sg_response.encoding='utf-8'
sg_text = sg_response.text

soup = BeautifulSoup(sg_text, 'lxml')
a_list = soup.select('.book-mulu a')
f_sg = open('sanguo.txt','w',encoding='utf-8')

for a in a_list:
    title = a.text
    url = 'https://www.shicimingju.com'+a['href']
    det_response = requests.get(url=url,headers=headers)
    det_response.encoding = 'utf-8'
    det_text = det_response.text.replace(u'&nbsp;', u'')
    det_soup = BeautifulSoup(det_text,'lxml')
    sg_content = det_soup.find('div',class_='chapter_content').text
    f_sg.write(title + '\n' + sg_content + '\n')
    f_sg.flush()
    print(title, '下载成功!')
f_sg.close()

4.3 xpath

  • How it works:

    1. Instantiate an etree object and load the page source to be parsed into it
    2. Call the etree object's xpath method with different forms of xpath expressions to locate tags and extract data
  • Environment setup:

    • pip install lxml
  • Instantiating an etree object:

    • Local file: etree.parse('test.html')
    • Page source from the web: etree.HTML(page_text)
  • The xpath method always returns a list

  • Common xpath expressions:

    • Expression   Meaning
      nodename     selects all child nodes of the named node
      /            selects starting from the root node
      //           selects matching nodes anywhere in the document, regardless of position
      .            selects the current node
      ..           selects the parent of the current node
      @            selects attributes

4.3.1 How to use xpath

from lxml import etree

# tree = etree.parse('sanguo.html')

# Problem: XMLSyntaxError: Opening and ending tag mismatch: img line 75 and div, line 81, column 19
# This happens because the HTML is not well-formed and does not satisfy the XML parser
# Fix: create an HTML parser explicitly and pass it via the parser argument
text = 'sanguo.html'
parser = etree.HTMLParser(encoding="utf-8")
html = etree.parse(text, parser=parser)

# Locating tags
# Start from the root node; '/' between tags denotes one level of nesting
html.xpath('/html/head/title')
# Start from any node, i.e. find every title tag in the document
html.xpath('//title')
# Look for p tags anywhere under html/body
html.xpath('/html/body//p')
# './' means continue searching for div from the current tag
html.xpath('/html/body/./div')

# Locating by attribute
# Locate the div tag whose class attribute equals 'book-mulu'
html.xpath('//div[@class="book-mulu"]')

# Locating by index; xpath indexes start at 1
# Locate the a tag inside the 3rd li under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//li[3]/a')

# Extracting text
# Text of the a tag inside the 3rd li under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//li[3]/a/text()')[0]
# Text of every a tag under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//a/text()')
# All text under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//text()')

# All text under the div whose class is 'book-mulu', joined into one string
html.xpath('//div[@class="book-mulu"]')[0].xpath("string(.)")

# Extracting attribute values
# href attribute values of every a tag under the div whose class is 'book-mulu'
html.xpath('//div[@class="book-mulu"]//a/@href')

4.3.2 Scraping page content with XPath

# Scrape the title, author, and body of each joke
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}

url = 'https://xiaohua.zol.com.cn/lengxiaohua/'

xh_response = requests.get(url=url, headers=headers)
xh_response.encoding = 'gbk'
xh_page = xh_response.text

xh = etree.HTML(xh_page)
li_list = xh.xpath('//ul[@class="article-list"]/li')
for li in li_list:
    title = li.xpath('.//span[@class="article-title"]//a/text()')[0]
    author = li.xpath('.//div[@class="article-source"]')[0].xpath('string(.)')
    content = li.xpath('.//div[@class="summary-text"]//text()')
    content = ''.join(content)
    print(title + " : " + author + "\n" + content + "\n")

4.3.3 Scraping images from a wallpaper site with XPath

import requests
from lxml import etree
import os
import time

dirname = "beautylibs"
if not os.path.exists(dirname):
    os.mkdir(dirname)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}

base_url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
for page in range(1, 4):
    if page == 1:
        url = 'http://pic.netbian.com/4kmeinv/'
    else:
        url = base_url % page
    xh_page = requests.get(url=url, headers=headers).text

    xh_etree = etree.HTML(xh_page)
    img_list = xh_etree.xpath('//ul[@class="clearfix"]/li/a')
    for img in img_list:
        img_url = "http://pic.netbian.com/" + img.xpath('.//img/@src')[0]
        # To use the Chinese file name instead, use the following line:
        #     img_name = img.xpath('.//b/text()')[0].encode('iso-8859-1').decode('gbk')
        img_name = img.xpath('.//img/@src')[0].split('/')[-1]
        img_data = requests.get(url=img_url, headers=headers).content
        img_path = dirname + '/' + img_name
        with open(img_path, 'wb') as f:
            f.write(img_data)
        print(img_name, '下载成功!')
        time.sleep(1)

4.3.4 Fetching all city names on the page

import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
aqi_text = requests.get('https://www.aqistudy.cn/historydata/', headers=headers).text
aqi_etree = etree.HTML(aqi_text)
city_list = aqi_etree.xpath('//div[@class="hot"]/div[2]/ul/li/a | //div[@class="all"]/div[@class="bottom"]//a')
for city in city_list:
    city_name = city.xpath('./text()')[0]
    print(city_name)

4.4 Combined data-parsing exercise

  • Download all of the free resume templates from https://sc.chinaz.com/jianli/free.html
import requests
from bs4 import BeautifulSoup
from lxml import etree
import os
import time

dir_name = "jianli"
if not os.path.exists(dir_name):
    os.mkdir(dir_name)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}

for page in range(1, 3):
    if page == 1:
        url = "https://sc.chinaz.com/jianli/free.html"
    else:
        url = "https://sc.chinaz.com/jianli/free_%d.html" % page
    # Everything below runs inside the page loop so that every page is downloaded
    jl_session = requests.session()
    jl_page = jl_session.get(url=url, headers=headers).text

    jl_soup = BeautifulSoup(jl_page, 'lxml')
    jl_items = jl_soup.select('#container a')

    for jl_item in jl_items:
        det_url = jl_item['href']
        det_session = requests.session()
        det_page = det_session.get(url=det_url, headers=headers).text
        det_etree = etree.HTML(det_page)
        doc_url = det_etree.xpath('//*[@id="down"]/div[2]/ul/li[3]/a/@href')[0].encode('iso-8859-1').decode('utf-8')
        doc_name = dir_name + '/' + doc_url.split('/')[-1]
        doc_data = requests.get(url=doc_url, headers=headers).content
        with open(doc_name, 'wb') as df:
            df.write(doc_data)
            df.flush()
        print(doc_url)
        time.sleep(1)

5 Advanced Requests Features

5.1 Proxies

Proxies can be compared by their level of anonymity, which splits them into the following categories:

  • Elite (high-anonymity) proxies: forward packets unchanged, so to the server it looks like an ordinary client is visiting, and the recorded IP is the proxy server's IP.
  • Anonymous proxies: make some changes to the packets; the server may detect that a proxy is in use and has some chance of tracing the client's real IP.
  • Transparent proxies: not only change the packets but also tell the server the client's real IP.
  • Spying proxies: proxy servers set up by organizations or individuals to record the data users transmit, for research, monitoring, and similar purposes.
  • A site with free proxy hosts that can be scraped: http://www.kxdaili.com/dailiip.html
# proxies is a dict mapping scheme -> proxy address
proxy_pool = {
    'http': '111.225.152.82:8089',
    'https': '111.225.152.82:8089',
}

page_response = requests.get(url=url, headers=headers, proxies=proxy_pool, timeout=5)

5.2 Handling cookies

  • Manual handling: put the cookie into the request headers

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/76.0.3809.132 Safari/537.36",
        "Cookie": "ASPSESSIONIDSQQDDDAS=PKJNIAHDMDCFENGBPLEJHLPN; Hm_lvt_95884c6bb5f83262667b4cf046ba2712=1669879562; ...",
    }
    
  • Automatic handling: the session object. You can create a session object, which sends requests just like requests does. The difference is that any cookie produced while sending requests through the session is stored in the session object automatically and sent with subsequent requests; see the sketch below.

    jl_session = requests.session()
    jl_page = jl_session.get(url=url, headers=headers).text
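
A minimal sketch of this cookie persistence, assuming httpbin.org as a test endpoint (not one of the sites used above):

import requests

session = requests.session()
# The endpoint answers with a Set-Cookie header; the session stores the cookie automatically
session.get('https://httpbin.org/cookies/set/token/abc123')
print(session.cookies.get('token'))                        # 'abc123'
# The stored cookie is sent with every later request made through the same session
print(session.get('https://httpbin.org/cookies').json())   # {'cookies': {'token': 'abc123'}}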
    

5.3 Combined exercises

5.3.1 Scraping free proxies from a proxy-list site and testing whether they work

import json
import requests
from bs4 import BeautifulSoup


class GetProxyIP(object):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/76.0.3809.132 Safari/537.36",
    }
    # proxy_pool = [
    #     {"http": "120.77.172.30:8118"},
    # ]
    check_url = 'https://www.ip.cn/'

    # Check whether a proxy IP works
    def check_proxy_ip(self, proxies_ip):
        res = False
        try:
            print(proxies_ip)
            page_response = requests.get(url=self.check_url, headers=self.headers, proxies=proxies_ip, timeout=5)
            if page_response.status_code == 200:
                res = True
        except Exception as error_info:
            res = False
        return res

    # Scrape the free proxy list and build the proxy pool
    def get_proxy_pool(self):
        proxy_pool = []
        for page in range(1, 2):
            if page == 1:
                proxy_url = 'http://www.kxdaili.com/dailiip.html'
            else:
                proxy_url = "http://www.kxdaili.com/dailiip/1/%d.html" % page
            proxy_text = requests.get(url=proxy_url, headers=self.headers).text
            proxy_soup = BeautifulSoup(proxy_text, 'lxml')
            proxy_ip_list = proxy_soup.select("tbody > tr")

            # Keep only the proxies that pass the check
            for proxy_ip in proxy_ip_list:
                ip = proxy_ip.select('td')[0].text
                port = proxy_ip.select('td')[1].text
                # The site lists the type as e.g. "HTTP" or "HTTP,HTTPS"; requests expects lowercase scheme keys
                proxy_type = proxy_ip.select('td')[3].text
                if "," in proxy_type.strip():
                    ptype_list = proxy_type.split(',')
                    for p_type in ptype_list:
                        tmp_dict = {p_type.strip().lower(): ip.strip() + ":" + port.strip()}
                        if self.check_proxy_ip(tmp_dict):
                            proxy_pool.append(tmp_dict)
                else:
                    tmp_dict = {proxy_type.strip().lower(): ip.strip() + ":" + port.strip()}
                    if self.check_proxy_ip(tmp_dict):
                        proxy_pool.append(tmp_dict)
            print(proxy_pool)
        return proxy_pool

    def run(self):
        with open("proxy_list.txt", 'w', encoding='utf-8') as f:
            json.dump(self.get_proxy_pool(), f)


if __name__ == '__main__':
    ip = GetProxyIP()
    ip.run()

5.3.2 Simulating a login to gushiwen.cn

import ddddocr
import requests
from lxml import etree

url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}

# 1. Create a session object
gsw_session = requests.session()
gsw_text = gsw_session.get(url=url, headers=headers).text

# 2. Instantiate an etree object and parse the page's dynamic form parameters
gsw_etree = etree.HTML(gsw_text)
__VIEWSTATE = gsw_etree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = gsw_etree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

# 3. Extract the captcha image URL and recognize the captcha
code_src = gsw_etree.xpath('//*[@id="imgCode"]/@src')[0]
code_url = 'https://so.gushiwen.cn/' + code_src
code_data = gsw_session.get(url=code_url, headers=headers).content
ocr = ddddocr.DdddOcr()
code_text = ocr.classification(code_data)

# 4. Build the login payload, send the request, and save the result locally
data = {
    '__VIEWSTATE': __VIEWSTATE,
    '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': '123456@qq.com',
    'pwd': 'fcareypasswd',
    'code': code_text,
    'denglu': '登录',
}
print(data)

gsw_login_text = gsw_session.post(url=url, headers=headers, data=data).text
with open('login.html', 'w', encoding='utf-8') as f:
    f.write(gsw_login_text)

5.3.3 Scraping videos from pearvideo.com

import re
import requests
from lxml import etree
import os
import time

# Create the directory that will hold the videos
dirname = 'pearvideos'
if not os.path.exists(dirname):
    os.mkdir(dirname)
# Spoof the User-Agent to pass UA checks
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/76.0.3809.132 Safari/537.36",
}
url = 'https://www.pearvideo.com/category_1'

pear_text = requests.get(url, headers=headers).text
# Parse the video list
pear_etree = etree.HTML(pear_text)
# xpath that matches both of the site's list layouts
det_video_list = pear_etree.xpath('//*[@id="listvideoListUl"]/li | //*[@id="categoryList"]/li')
for det_video in det_video_list:
    det_video_url = 'https://www.pearvideo.com/' + det_video.xpath('./div/a/@href')[0]
    video_title = det_video.xpath('./div/a/div[2]/text()')[0]
    # Keep only the characters that are legal in file names
    video_title = re.findall(r'[^\*"/:?\\|<>]', video_title, re.S)
    video_title = ''.join(video_title)
    contId = det_video_url.split('_')[1]
    videoStatus_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + contId
    det_video_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/76.0.3809.132 Safari/537.36",
        'Referer': det_video_url,
    }
    det_video_text = requests.get(url=videoStatus_url, headers=det_video_headers).json()
    video_url = det_video_text['videoInfo']['videos']['srcUrl']
    video_data = requests.get(video_url, headers=headers).content
    with open(dirname+'/'+video_title + '.mp4', 'wb') as f:
        f.write(video_data)
    time.sleep(3)
