Python web crawler (2): Request format for synchronously loaded pages (XPath)
Characteristics of a synchronously loaded page:
What you see is what you get: the fields shown after the browser renders the page are identical to the data in the "Response" tab of the developer tools and in the page source (right-click "View Source").
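A quick way to check this in practice is to fetch the page with requests and test whether a string you can see in the rendered page also appears in the raw response body. A minimal sketch, using a hypothetical URL and text fragment:

import requests

# Hypothetical: a page you suspect is synchronously loaded, plus a
# text fragment that is visible in the rendered browser page
url = 'https://www.shu.com/bookmark/sidamingzhu.html'
fragment = '三国演义'
response = requests.get(url)
# True  -> the data is in the raw HTML (synchronous)
# False -> it is probably injected later by JavaScript (asynchronous)
print(fragment in response.text)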
General format of a synchronous request
import requests
from lxml import etree

# A synchronously loaded site (fill in the target URL)
url = ''
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Cookie': '********************************',
    'Referer': '*******************************',
}
response = requests.get(url, headers=headers)
# Get the response body and decode it (UTF-8 by default)
data = response.content.decode()
# Parse the data with lxml so we can run XPath queries against it
tree = etree.HTML(data)
# Loop over the matched nodes and pull out whatever data you need
tr_list = tree.xpath('//table[@border="1"]/tbody/tr')
for tr in tr_list:
    td = tr.xpath('./td/text()')
    print(td)
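One caveat on the decoding step: response.content.decode() assumes UTF-8. For sites that serve another charset (GBK is still common on older Chinese sites), a safer sketch is to let requests detect the encoding:

response = requests.get(url, headers=headers)
# Let requests guess the charset from the body, then use the decoded text
response.encoding = response.apparent_encoding
data = response.text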
Hands-on examples:
Example 1: Get the four books and their URLs
import requests
from lxml import etree

# Target page URL
url = 'https://www.shu.com/bookmark/sidamingzhu.html'
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}
# Send a GET request for the page
response = requests.get(url, headers=headers)
data = response.content.decode('utf-8')
# Parse the HTML with lxml
tree = etree.HTML(data)
# Get the list of h3 elements that hold the book info
div_list = tree.xpath('//div[@class="book-item"]/h3')
# Start with an empty book_list dict
book_list = {}
# Loop over each element and extract the book name and link
for books in div_list:
    # Extract the book name
    book = books.xpath('./a/text()')[0]
    # Extract the relative link and join it into a full URL
    # (renamed to book_url so it does not shadow the outer url)
    book_url = 'https://www.shu.com' + books.xpath('./a/@href')[0]
    # Record the book name and full URL in the book_list dict
    book_list[book] = book_url
print(book_list)
# Output
{'《三国演义》': 'https://www.shu.com/book/sanguoyanyi.html',
'《水浒传》': 'https://www.shu.com/book/shuihuzhuan.html',
'《西游记》': 'https://www.shu.com/book/xiyouji.html',
'《红楼梦》': 'https://www.shu.com/book/hongloumeng.html'}
Example 2: Get all chapters of one book
import requests
from lxml import etree

url = 'https://www.shu.com/book/sanguoyanyi.html'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}
res = requests.get(url, headers=headers)
tree = etree.HTML(res.content.decode('utf-8'))
# Grab the chapter list of "Romance of the Three Kingdoms"
# Alternative: fetch names and hrefs in one query with a union:
# mulu = tree.xpath('//div[@class="book-mulu"]/ul/li/a/text() | //div[@class="book-mulu"]/ul/li/a/@href')
li_list = tree.xpath('//div[@class="book-mulu"]/ul/li')
zhangjie = {}
for li in li_list:
    a = li.xpath('./a/text()')[0]
    b = 'https://www.shu.com' + li.xpath('./a/@href')[0]
    zhangjie[a] = b
print(zhangjie)
# Output
{'第一回·宴桃园豪杰三结义 斩黄巾英雄首立功': 'https://www.shu.com/book/sanguoyanyi/1.html',
'第二回·张翼德怒鞭督邮 何国舅谋诛宦竖': 'https://www.shu.com/book/sanguoyanyi/2.html',
'第三回·议温明董卓叱丁原 馈金珠李肃说吕布': 'https://www.shu.com/book/sanguoyanyi/3.html',
……
'第一百二十回·荐杜预老将献新谋 降孙皓三分归一统': 'https://www.shu.com/book/sanguoyanyi/120.html'}
Example 3: Download a chapter's original text
import requests
from lxml import etree

url = 'https://www.shu.com/book/sanguoyanyi/1.html'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}
res = requests.get(url, headers=headers)
# print(res.content.decode('utf-8'))
tree = etree.HTML(res.content.decode('utf-8'))
# Grab the chapter content with XPath
# One-liner alternative:
# content = ''.join(tree.xpath('//div[@class="card bookmark-list"]/div/p/text()'))
# print(content)
title = tree.xpath('//div[@class="card bookmark-list"]/h1/text()')
data = tree.xpath('//div[@class="card bookmark-list"]/div/p/text()')
# Print the chapter title
print(title[0])
# Print the content
content = ''  # initialize an empty string
for i in data:
    content = content + i.strip() + '\n'
print(content)
# Output (excerpt)
第一回·宴桃园豪杰三结义 斩黄巾英雄首立功
滚滚长江东逝水,浪花淘尽英雄。是非成败转头空。青山依旧在,几度夕阳红。 白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢。古今多少事,都付笑谈中。
——调寄《临江仙》
……
三人救了董卓回寨。卓问三人现居何职。玄德曰:“白身。”卓甚轻之,不为礼。玄德出,张飞大怒曰:“我等亲赴血战,救了这厮,他却如此无礼。若不杀之,难消我气!”便要提刀入帐来杀董卓。正是:人情势利古犹今,谁识英雄是白身?安得快人如翼德,尽诛世上负心人!
毕竟董卓性命如何,且听下文分解。
If we want to download all four classic novels in one go, let's organize our thinking first. The work breaks down into three main fetching steps, plus writing the files and an overall loop:
- Get the book names and URLs of all four classics
  - Parse the HTML content
  - Use XPath to extract the book names and URLs
- Get all the chapter names and URLs of one book
  - Parse the HTML content
  - Use XPath to extract all the chapters and their URLs
- Get the chapter name and chapter content of a single chapter
  - Parse the HTML content
  - Use XPath to extract the chapter name and content
- Write the results to files
- Call all of the above in an overall loop
So, to keep the code concise, easy to use, and maintainable, we wrap the repeatedly used pieces into functions that can be called over and over.
Part 1: Wrapping the "parse HTML content" step
The "parse HTML content" code is repeated in every example, and the only thing that varies is the URL, so this part can be wrapped on its own: pass in the URL, and simply return the tree.
def get_html(url):
    '''
    Request the url and return a tree object
    :param url: the URL of the book list, a book's chapter list, or a chapter's content
    :return: tree object
    '''
    response = requests.get(url, headers=headers)
    data = response.content.decode()
    # Build the tree object
    tree = etree.HTML(data)
    return tree
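Assuming the headers dict from the earlier examples is defined at module level, usage is a one-liner:

# Fetch and parse the book-list page from Example 1
tree = get_html('https://www.shu.com/bookmark/sidamingzhu.html')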
Part 2: Wrapping "use XPath to get the book names and URLs"
def get_books(tree):
    '''
    Get the names and URLs of the four classic novels
    :param tree: tree object
    :return: dict {'三国演义': 'http://....'}
    '''
    books = tree.xpath('//div[@class="book-item"]/h3/a')
    books_dict = {}
    # print(books)
    for book in books:
        # Grab each book's URL
        url = 'https://www.shu.com' + book.xpath('./@href')[0]
        # Grab each book's name
        book_name = book.xpath('./text()')[0]
        # print(url, book_name)
        books_dict[book_name] = url
    return books_dict
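Chained with get_html, this reproduces Example 1 in a single call (output abbreviated):

books = get_books(get_html('https://www.shu.com/bookmark/sidamingzhu.html'))
print(books)  # {'《三国演义》': 'https://www.shu.com/book/sanguoyanyi.html', ...}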
Part 3: Wrapping "get all the chapter names and URLs of one book"
def book_mulu(tree):
    '''
    Get the chapter list of one of the four classics
    :param tree: tree object
    :return: dict {'chapter name': 'url'}
    '''
    a_list = tree.xpath('//div[@class="book-mulu"]/ul/li/a')
    mulu_dict = {}
    for a in a_list:
        # Get the current chapter's URL
        url = 'https://www.shu.com' + a.xpath('./@href')[0]
        # Get the current chapter's name
        mulu = a.xpath('./text()')[0]
        mulu_dict[mulu] = url
    return mulu_dict
Part 4: Wrapping "use XPath to get the chapter name and content"
def book_mulu_detail(tree):
    '''
    Get the detailed content of a chapter
    :param tree: tree object
    :return: dict {'chapter name': 'chapter content'}
    '''
    title = tree.xpath('//div[@class="card bookmark-list"]/h1/text()')[0]
    data = tree.xpath('//div[@class="card bookmark-list"]/div/p/text()')
    title_content = {}
    content = ''  # initialize an empty string; the loop below appends each <p> plus a newline
    for i in data:
        content = content + i.strip() + '\n'
    title_content[title] = content
    return title_content
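This also chains with get_html; pointed at the chapter-1 URL from Example 3, it should return a one-entry dict (value abbreviated here):

detail = book_mulu_detail(get_html('https://www.shu.com/book/sanguoyanyi/1.html'))
# {'第一回·宴桃园豪杰三结义 斩黄巾英雄首立功': '滚滚长江东逝水,...'}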
Part 5: Wrapping "write to a file": each chapter of each book becomes its own txt file
def save_to_file(book_dir, title, content):
    '''
    Write the content to a file
    :param book_dir: book name, used as the directory name
    :param title: chapter name
    :param content: chapter content
    :return: None
    '''
    # Create the directory if it does not exist
    if not os.path.exists(book_dir):
        os.makedirs(book_dir)
    # Build the relative file path: book/title.txt
    file_path = os.path.join(book_dir, f"{title}.txt")
    # Write the content to the file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)
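A quick sanity check with placeholder strings (this should create a 三国演义 directory containing 第一回.txt):

save_to_file('三国演义', '第一回', '滚滚长江东逝水……')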
Main loop logic (pseudocode):
for book_name, book_url in get_books(url):
    for chapter, chapter_url in get_chapters(book_name, book_url):
        get_content(chapter, chapter_url)  # returns a dict {chapter: content}
        write_content(book_name, {chapter: content})
The complete book-downloading script:
import os
import random
import time
import requests
from lxml import etree

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}

def get_html(url):
    '''
    Request the url and return a tree object
    :param url: the URL of the book list, a book's chapter list, or a chapter's content
    :return: tree object
    '''
    response = requests.get(url, headers=headers)
    data = response.content.decode('utf-8')
    # Build the tree object
    tree = etree.HTML(data)
    return tree
def get_books(tree):
    '''
    Book getter: book names and book URLs (the four classics)
    :param tree: tree object
    :return: dict {'三国演义': 'http://....'}
    '''
    books = tree.xpath('//div[@class="book-item"]/h3/a')
    books_dict = {}
    for book in books:
        # Grab each book's URL
        url = 'https://www.shu.com' + book.xpath('./@href')[0]
        # Grab each book's name
        book_name = book.xpath('./text()')[0]
        books_dict[book_name] = url
    return books_dict
def book_mulu(tree):
    '''
    Chapter getter: chapter names and chapter URLs
    :param tree: tree object
    :return: dict {'chapter name': 'url'}
    '''
    a_list = tree.xpath('//div[@class="book-mulu"]/ul/li/a')
    mulu_dict = {}
    for a in a_list:
        # Get the current chapter's URL
        url = 'https://www.shu.com' + a.xpath('./@href')[0]
        # Get the current chapter's name
        mulu = a.xpath('./text()')[0]
        mulu_dict[mulu] = url
    return mulu_dict
def book_mulu_detail(tree):
    '''
    Content getter: chapter name and chapter content
    :param tree: tree object
    :return: dict {'chapter name': 'chapter content'}
    '''
    title = tree.xpath('//div[@class="card bookmark-list"]/h1/text()')[0]
    data = tree.xpath('//div[@class="card bookmark-list"]/div/p/text()')
    content = ''  # initialize an empty string; the loop below appends each <p> plus a newline
    for i in data:
        content = content + i.strip() + '\n'
    return {title: content}
def save_to_file(book_dir, mulu_detail):
    '''
    Writer: write the content to files
    :param book_dir: book name, used as the directory name
    :param mulu_detail: {'chapter': 'content'}
    :return: None
    '''
    # Create the directory if it does not exist
    if not os.path.exists(book_dir):
        os.makedirs(book_dir)
    # Write each chapter to its own file
    for mulu_name, mulu_con in mulu_detail.items():
        # Build the relative file path: book/title.txt
        file_path = os.path.join(book_dir, f"{mulu_name}.txt")
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(mulu_name + '\n')
            file.write(mulu_con)
def main():
    url = 'https://www.shu.com/bookmark/sidamingzhu.html'
    '''
    for book_name, book_url in get_books(url):
        for chapter, chapter_url in get_chapters(book_name, book_url):
            get_content(chapter, chapter_url)  # returns a dict {chapter: content}
            write_content(book_name, {chapter: content})
    '''
    for book_name, book_url in get_books(get_html(url)).items():
        print(book_name)
        for title, title_url in book_mulu(get_html(book_url)).items():
            title_content = book_mulu_detail(get_html(title_url))
            save_to_file(book_name, title_content)
            print(title, title_url, "download complete")
            time.sleep(random.randint(1, 4))  # random pause to go easy on the server
        exit()  # remove this line; it is only here to stop the script early for demonstration

if __name__ == '__main__':
    main()
Execution result