python爬虫1-requests库

requests库

requests 提供发送网络请求和处理响应的方法

安装

pip install requests

GET 请求

import requests

# Target URL and the query-string parameters for the GET request.
target = 'https://www.baidu.com/'
query = {'key1': 'value1', 'key2': 'value2'}

# Send the GET request; `params` is URL-encoded into the query string.
response = requests.get(target, params=query)

# Final URL includes the encoded parameters,
# e.g. https://www.baidu.com/?key1=value1&key2=value2
print(response.url)
print(response.text)    # Response body as text

POST 请求

import requests

# Endpoint and form payload for the POST request.
endpoint = 'https://www.baidu.com/'
payload = {'key': 'value'}

# Send the POST request; `data` is submitted as a form-encoded body.
response = requests.post(endpoint, data=payload)

print(response.text)    # Print the response body as text

自定义请求头

import requests

url = 'https://www.baidu.com/'

# Custom request headers.
# FIX: the header value must NOT repeat the header name — the original
# value started with a stray "User-Agent: " prefix, so the server would
# receive "User-Agent: User-Agent: Mozilla/5.0 ...". Only the product
# token belongs in the value.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0'
}
response = requests.get(url, headers=headers)

print(response.text)    # Response body as text

异常处理

import requests
from requests.exceptions import RequestException

try:
    # Fetch the page.
    response = requests.get('https://www.baidu.com/')
    # raise_for_status() raises HTTPError for any non-2xx status code.
    response.raise_for_status()
except RequestException as e:
    # RequestException is the base class for all requests errors
    # (connection failures, timeouts, HTTPError, ...).
    print(e)
else:
    # Only reached when the request succeeded.
    print(response.text)

豆瓣实例

import requests

# Douban "top list" JSON endpoint (chart type and rating interval are
# preselected in the URL; paging is done via `start`/`limit` params).
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100:90&action=&'
# Spoof a browser User-Agent so the API does not reject the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36 '
}

# Page range to fetch (inclusive). Validate before hitting the network.
start_page = int(input('起始页码:'))
end_page = int(input('结束的页码:'))
if start_page < 1 or end_page < start_page:
    raise ValueError(f'invalid page range: {start_page}..{end_page}')

# Fetch each page (20 items per page) and save the raw JSON to disk.
for page in range(start_page, end_page + 1):
    params = {
        'start': (page - 1) * 20,  # 0-based offset of the first item on this page
        'limit': 20,               # items per page
    }
    # timeout prevents the script from hanging forever on a stalled connection;
    # raise_for_status() fails loudly instead of silently saving an error page.
    resp = requests.get(url=url, headers=headers, params=params, timeout=10)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    # Save the raw JSON response, one file per page.
    with open(f'douban{page}.json', 'w', encoding='utf-8') as fp:
        fp.write(resp.text)
posted @ 2024-07-04 17:33  noahze  阅读(1)  评论(0编辑  收藏  举报