# -*- coding: utf-8 -*-
# @Time : 2020/9/15 22:23
# @Author : aqiong
# @Site :
# @File : bs4三国演义练习.py
# @Software: PyCharm
#https://www.shicimingju.com/book/sanguoyanyi.html
from bs4 import BeautifulSoup
import requests
import lxml
import os
#https://www.shicimingju.com/book/sanguoyanyi/1.html
if __name__ == '__main__':
    # Scrape every chapter of "Romance of the Three Kingdoms" from
    # shicimingju.com and append each chapter (title + body) to 三国演义.txt.
    if os.path.exists('./三国演义.txt'):
        # Start from a clean file so repeated runs don't duplicate chapters.
        os.remove('./三国演义.txt')
        print('已存在 三国演义.txt ,现删除')
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    # BUGFIX: the site serves UTF-8 without a charset in the HTTP header, so
    # requests falls back to ISO-8859-1 and `.text` produces garbled Chinese.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # All chapter links live under the .book-mulu > ul > li > a hierarchy;
    # select() returns them as a list of <a> tags.
    chapter_links = soup.select('.book-mulu>ul>li>a')
    # Open the output file once, instead of re-opening it for every chapter.
    with open('./三国演义.txt', 'a', encoding='utf-8') as fp:
        for link in chapter_links:
            # Chapter pages are the site root joined with the <a> href
            # (e.g. /book/sanguoyanyi/1.html).
            text_url = 'https://www.shicimingju.com' + link['href']
            chapter_resp = requests.get(url=text_url, headers=headers)
            chapter_resp.encoding = 'utf-8'  # same charset fallback issue as above
            text_soup = BeautifulSoup(chapter_resp.text, 'lxml')
            # Chapter body sits inside <div class="chapter_content">.
            text = text_soup.find('div', class_='chapter_content').text
            print(link.text)
            fp.write(link.text)  # chapter title
            fp.write(text)       # chapter body