# 爬虫练习:使用bs4爬取诗词名句网的《三国演义》

# -*- coding: utf-8 -*-
# @Time : 2020/9/15 22:23
# @Author : aqiong
# @Site : 
# @File : bs4三国演义练习.py
# @Software: PyCharm
#https://www.shicimingju.com/book/sanguoyanyi.html
from bs4 import BeautifulSoup
import requests
import lxml
import os
#https://www.shicimingju.com/book/sanguoyanyi/1.html
if __name__ == '__main__':
    # Scrape every chapter of "Romance of the Three Kingdoms" from
    # shicimingju.com and append title + body of each chapter to a local file.
    out_path = './三国演义.txt'
    if os.path.exists(out_path):
        os.remove(out_path)
        print('已存在 三国演义.txt ,现删除')
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    # BUG FIX: the site serves UTF-8 but requests often guesses ISO-8859-1 from
    # the headers, which garbles the Chinese text. Force UTF-8 before .text.
    resp = requests.get(url=url, headers=headers, timeout=10)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'lxml')

    # All chapter links live under .book-mulu > ul > li > a;
    # select() returns them as a list of <a> tags.
    chapter_links = soup.select('.book-mulu>ul>li>a')

    # BUG FIX: open the output file once instead of reopening it per chapter.
    with open(out_path, 'a', encoding='utf-8') as fp:
        for link in chapter_links:
            # Chapter URL = site root + the <a> tag's href attribute value.
            chapter_url = 'https://www.shicimingju.com' + link['href']
            chapter_resp = requests.get(url=chapter_url, headers=headers, timeout=10)
            chapter_resp.encoding = 'utf-8'
            chapter_soup = BeautifulSoup(chapter_resp.text, 'lxml')
            # The chapter body sits in <div class="chapter_content">.
            content_div = chapter_soup.find('div', class_='chapter_content')
            if content_div is None:
                # BUG FIX: page layout changed or request failed quietly;
                # skip this chapter instead of crashing on None.text.
                print('未找到正文:', chapter_url)
                continue
            print(link.text)
            fp.write(link.text)         # chapter title
            fp.write('\n')              # BUG FIX: separate title from body
            fp.write(content_div.text)  # chapter body
            fp.write('\n')
# posted @ 2020-09-16 12:38  阿琼!!!!!  阅读(276)  评论(0)  编辑  收藏  举报