Python web crawler (2): Request format for synchronously loaded pages (XPath)
Characteristics of a synchronously loaded page:
What you see is what you get: the fields shown after the browser renders the page are identical to the data in the "Response" tab of the developer tools and in the page source (right-click "View Source").
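A quick way to check this in practice is to fetch the page with requests and test whether a string you can see in the rendered page also appears in the raw response body. A minimal sketch, using a hypothetical URL and text fragment:

import requests

# Hypothetical: a page you suspect is synchronously loaded, plus a
# text fragment that is visible in the rendered browser page
url = 'https://www.shu.com/bookmark/sidamingzhu.html'
fragment = '三国演义'
response = requests.get(url)
# True  -> the data is in the raw HTML (synchronous)
# False -> it is probably injected later by JavaScript (asynchronous)
print(fragment in response.text)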
General format of a synchronous request
import requests
from lxml import etree

# A synchronously loaded site (fill in the target URL)
url = ''
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Cookie': '********************************',
    'Referer': '*******************************',
}
response = requests.get(url, headers=headers)
# Get the response body and decode it (UTF-8 by default)
data = response.content.decode()
# Parse the data with lxml so we can run XPath queries against it
tree = etree.HTML(data)
# Loop over the matched nodes and pull out whatever data you need
tr_list = tree.xpath('//table[@border="1"]/tbody/tr')
for tr in tr_list:
    td = tr.xpath('./td/text()')
    print(td)
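One caveat on the decoding step: response.content.decode() assumes UTF-8. For sites that serve another charset (GBK is still common on older Chinese sites), a safer sketch is to let requests detect the encoding:

response = requests.get(url, headers=headers)
# Let requests guess the charset from the body, then use the decoded text
response.encoding = response.apparent_encoding
data = response.text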
Hands-on examples:
Example 1: Get the four books and their URLs
import requests
from lxml import etree

# Target page URL
url = 'https://www.shu.com/bookmark/sidamingzhu.html'
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}
# Send a GET request for the page
response = requests.get(url, headers=headers)
data = response.content.decode('utf-8')
# Parse the HTML with lxml
tree = etree.HTML(data)
# Get the list of h3 elements that hold the book info
div_list = tree.xpath('//div[@class="book-item"]/h3')
# Start with an empty book_list dict
book_list = {}
# Loop over each element and extract the book name and link
for books in div_list:
    # Extract the book name
    book = books.xpath('./a/text()')[0]
    # Extract the relative link and join it into a full URL
    # (renamed to book_url so it does not shadow the outer url)
    book_url = 'https://www.shu.com' + books.xpath('./a/@href')[0]
    # Record the book name and full URL in the book_list dict
    book_list[book] = book_url
print(book_list)
# Output
{'《三国演义》': 'https://www.shu.com/book/sanguoyanyi.html',
'《水浒传》': 'https://www.shu.com/book/shuihuzhuan.html',
'《西游记》': 'https://www.shu.com/book/xiyouji.html',
'《红楼梦》': 'https://www.shu.com/book/hongloumeng.html'}
Example 2: Get all chapters of one book
import requests
from lxml import etree

url = 'https://www.shu.com/book/sanguoyanyi.html'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}
res = requests.get(url, headers=headers)
tree = etree.HTML(res.content.decode('utf-8'))
# Grab the chapter list of "Romance of the Three Kingdoms"
# Alternative: fetch names and hrefs in one query with a union:
# mulu = tree.xpath('//div[@class="book-mulu"]/ul/li/a/text() | //div[@class="book-mulu"]/ul/li/a/@href')
li_list = tree.xpath('//div[@class="book-mulu"]/ul/li')
zhangjie = {}
for li in li_list:
    a = li.xpath('./a/text()')[0]
    b = 'https://www.shu.com' + li.xpath('./a/@href')[0]
    zhangjie[a] = b
print(zhangjie)
# Output
{'第一回·宴桃园豪杰三结义 斩黄巾英雄首立功': 'https://www.shu.com/book/sanguoyanyi/1.html',
'第二回·张翼德怒鞭督邮 何国舅谋诛宦竖': 'https://www.shu.com/book/sanguoyanyi/2.html',
'第三回·议温明董卓叱丁原 馈金珠李肃说吕布': 'https://www.shu.com/book/sanguoyanyi/3.html',
……
'第一百二十回·荐杜预老将献新谋 降孙皓三分归一统': 'https://www.shu.com/book/sanguoyanyi/120.html'}
Example 3: Download a chapter's original text
import requests
from lxml import etree

url = 'https://www.shu.com/book/sanguoyanyi/1.html'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}
res = requests.get(url, headers=headers)
# print(res.content.decode('utf-8'))
tree = etree.HTML(res.content.decode('utf-8'))
# Grab the chapter content with XPath
# One-liner alternative:
# content = ''.join(tree.xpath('//div[@class="card bookmark-list"]/div/p/text()'))
# print(content)
title = tree.xpath('//div[@class="card bookmark-list"]/h1/text()')
data = tree.xpath('//div[@class="card bookmark-list"]/div/p/text()')
# Print the chapter title
print(title[0])
# Print the content
content = ''  # initialize an empty string
for i in data:
    content = content + i.strip() + '\n'
print(content)
# Output (excerpt)
第一回·宴桃园豪杰三结义 斩黄巾英雄首立功
滚滚长江东逝水,浪花淘尽英雄。是非成败转头空。青山依旧在,几度夕阳红。 白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢。古今多少事,都付笑谈中。
——调寄《临江仙》
……
三人救了董卓回寨。卓问三人现居何职。玄德曰:“白身。”卓甚轻之,不为礼。玄德出,张飞大怒曰:“我等亲赴血战,救了这厮,他却如此无礼。若不杀之,难消我气!”便要提刀入帐来杀董卓。正是:人情势利古犹今,谁识英雄是白身?安得快人如翼德,尽诛世上负心人!
毕竟董卓性命如何,且听下文分解。
If we want to download all four classic novels in one go, let's organize our thinking first. The work breaks down into three main fetching steps, plus writing the files and an overall loop:
- Get the book names and URLs of all four classics
  - Parse the HTML content
  - Use XPath to extract the book names and URLs
- Get all the chapter names and URLs of one book
  - Parse the HTML content
  - Use XPath to extract all the chapters and their URLs
- Get the chapter name and chapter content of a single chapter
  - Parse the HTML content
  - Use XPath to extract the chapter name and content
- Write the results to files
- Call all of the above in an overall loop
So, to keep the code concise, easy to use, and maintainable, we wrap the repeatedly used pieces into functions that can be called over and over.
Part 1: Wrapping the "parse HTML content" step
The "parse HTML content" code is repeated in every example, and the only thing that varies is the URL, so this part can be wrapped on its own: pass in the URL, and simply return the tree.
def get_html(url):
    '''
    Request the url and return a tree object
    :param url: the URL of the book list, a book's chapter list, or a chapter's content
    :return: tree object
    '''
    response = requests.get(url, headers=headers)
    data = response.content.decode()
    # Build the tree object
    tree = etree.HTML(data)
    return tree
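Assuming the headers dict from the earlier examples is defined at module level, usage is a one-liner:

# Fetch and parse the book-list page from Example 1
tree = get_html('https://www.shu.com/bookmark/sidamingzhu.html')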
Part 2: Wrapping "use XPath to get the book names and URLs"
def get_books(tree):
    '''
    Get the names and URLs of the four classic novels
    :param tree: tree object
    :return: dict {'三国演义': 'http://....'}
    '''
    books = tree.xpath('//div[@class="book-item"]/h3/a')
    books_dict = {}
    # print(books)
    for book in books:
        # Grab each book's URL
        url = 'https://www.shu.com' + book.xpath('./@href')[0]
        # Grab each book's name
        book_name = book.xpath('./text()')[0]
        # print(url, book_name)
        books_dict[book_name] = url
    return books_dict
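Chained with get_html, this reproduces Example 1 in a single call (output abbreviated):

books = get_books(get_html('https://www.shu.com/bookmark/sidamingzhu.html'))
print(books)  # {'《三国演义》': 'https://www.shu.com/book/sanguoyanyi.html', ...}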
Part 3: Wrapping "get all the chapter names and URLs of one book"
def book_mulu(tree):
    '''
    Get the chapter list of one of the four classics
    :param tree: tree object
    :return: dict {'chapter name': 'url'}
    '''
    a_list = tree.xpath('//div[@class="book-mulu"]/ul/li/a')
    mulu_dict = {}
    for a in a_list:
        # Get the current chapter's URL
        url = 'https://www.shu.com' + a.xpath('./@href')[0]
        # Get the current chapter's name
        mulu = a.xpath('./text()')[0]
        mulu_dict[mulu] = url
    return mulu_dict
Part 4: Wrapping "use XPath to get the chapter name and content"
def book_mulu_detail(tree):
    '''
    Get the detailed content of a chapter
    :param tree: tree object
    :return: dict {'chapter name': 'chapter content'}
    '''
    title = tree.xpath('//div[@class="card bookmark-list"]/h1/text()')[0]
    data = tree.xpath('//div[@class="card bookmark-list"]/div/p/text()')
    title_content = {}
    content = ''  # initialize an empty string; the loop below appends each <p> plus a newline
    for i in data:
        content = content + i.strip() + '\n'
    title_content[title] = content
    return title_content
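This also chains with get_html; pointed at the chapter-1 URL from Example 3, it should return a one-entry dict (value abbreviated here):

detail = book_mulu_detail(get_html('https://www.shu.com/book/sanguoyanyi/1.html'))
# {'第一回·宴桃园豪杰三结义 斩黄巾英雄首立功': '滚滚长江东逝水,...'}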
Part 5: Wrapping "write to a file": each chapter of each book becomes its own txt file
def save_to_file(book_dir, title, content):
    '''
    Write the content to a file
    :param book_dir: book name, used as the directory name
    :param title: chapter name
    :param content: chapter content
    :return: None
    '''
    # Create the directory if it does not exist
    if not os.path.exists(book_dir):
        os.makedirs(book_dir)
    # Build the relative file path: book/title.txt
    file_path = os.path.join(book_dir, f"{title}.txt")
    # Write the content to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)
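A quick sanity check with placeholder strings (this should create a 三国演义 directory containing 第一回.txt):

save_to_file('三国演义', '第一回', '滚滚长江东逝水……')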
Main loop logic (pseudocode):
for book_name, book_url in get_books(url):
    for chapter, chapter_url in get_chapters(book_name, book_url):
        get_content(chapter, chapter_url)  # returns a dict {chapter: content}
        write_content(book_name, {chapter: content})
The complete book-downloading script:
import os
import random
import time
import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}

def get_html(url):
    '''
    Request the url and return a tree object
    :param url: the URL of the book list, a book's chapter list, or a chapter's content
    :return: tree object
    '''
    response = requests.get(url, headers=headers)
    data = response.content.decode('utf-8')
    # Build the tree object
    tree = etree.HTML(data)
    return tree
def get_books(tree):
    '''
    Book getter: book names and book URLs (the four classics)
    :param tree: tree object
    :return: dict {'三国演义': 'http://....'}
    '''
    books = tree.xpath('//div[@class="book-item"]/h3/a')
    books_dict = {}
    for book in books:
        # Grab each book's URL
        url = 'https://www.shu.com' + book.xpath('./@href')[0]
        # Grab each book's name
        book_name = book.xpath('./text()')[0]
        books_dict[book_name] = url
    return books_dict
def book_mulu(tree):
    '''
    Chapter getter: chapter names and chapter URLs
    :param tree: tree object
    :return: dict {'chapter name': 'url'}
    '''
    a_list = tree.xpath('//div[@class="book-mulu"]/ul/li/a')
    mulu_dict = {}
    for a in a_list:
        # Get the current chapter's URL
        url = 'https://www.shu.com' + a.xpath('./@href')[0]
        # Get the current chapter's name
        mulu = a.xpath('./text()')[0]
        mulu_dict[mulu] = url
    return mulu_dict
def book_mulu_detail(tree):
    '''
    Content getter: chapter name and chapter content
    :param tree: tree object
    :return: dict {'chapter name': 'chapter content'}
    '''
    title = tree.xpath('//div[@class="card bookmark-list"]/h1/text()')[0]
    data = tree.xpath('//div[@class="card bookmark-list"]/div/p/text()')
    content = ''  # initialize an empty string; the loop below appends each <p> plus a newline
    for i in data:
        content = content + i.strip() + '\n'
    return {title: content}
def save_to_file(book_dir, mulu_detail):
    '''
    Writer: write the content to files
    :param book_dir: book name, used as the directory name
    :param mulu_detail: {'chapter': 'content'}
    :return: None
    '''
    # Create the directory if it does not exist
    if not os.path.exists(book_dir):
        os.makedirs(book_dir)
    # Write each chapter to its own file
    for mulu_name, mulu_con in mulu_detail.items():
        # Build the relative file path: book/title.txt
        file_path = os.path.join(book_dir, f"{mulu_name}.txt")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(mulu_name + '\n')
            file.write(mulu_con)
def main():
    url = 'https://www.shu.com/bookmark/sidamingzhu.html'
    '''
    for book_name, book_url in get_books(url):
        for chapter, chapter_url in get_chapters(book_name, book_url):
            get_content(chapter, chapter_url)  # returns a dict {chapter: content}
            write_content(book_name, {chapter: content})
    '''
    for book_name, book_url in get_books(get_html(url)).items():
        print(book_name)
        for title, title_url in book_mulu(get_html(book_url)).items():
            title_content = book_mulu_detail(get_html(title_url))
            save_to_file(book_name, title_content)
            print(title, title_url, "download complete")
            time.sleep(random.randint(1, 4))  # random pause to go easy on the server
        exit()  # remove this line; it is only here to stop the script early for demonstration

if __name__ == '__main__':
    main()
Execution result