Python爬取三国演义章节标题和内容(bs4爬取,解决中文乱码)

import os.path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root that chapter hrefs from the index page are resolved against.
BASE_URL = 'https://www.shicimingju.com/'
# Index page that lists every chapter of the book.
INDEX_URL = 'https://www.shicimingju.com/book/sanguoyanyi.html'
OUTPUT_DIR = './sanguoyanyi'
OUTPUT_FILE = './sanguoyanyi/sanguoyanyi.txt'
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def build_chapter_url(href):
    """Return the absolute chapter URL for an href taken from the index page.

    ``urljoin`` is used instead of string concatenation so that a
    root-relative href (``/book/...``) does not yield a double slash and an
    already-absolute href is passed through unchanged.
    """
    return urljoin(BASE_URL, href)


def strip_spaces(text):
    """Return *text* with ASCII spaces and non-breaking spaces removed.

    Newlines and all other characters are kept as they are.
    """
    return text.replace('\xa0', '').replace(' ', '')


def fetch_soup(url):
    """Download *url* and return its HTML parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Force the decoding instead of relying on requests' charset guess; this
    # avoids garbled Chinese text. 'utf-8-sig' also drops a leading BOM.
    response.encoding = 'utf-8-sig'
    return BeautifulSoup(response.text, 'lxml')


def main():
    """Scrape every chapter title and body into a single UTF-8 text file.

    Each chapter is written as ``title:content`` followed by a newline. A
    chapter that cannot be downloaded or has no content container is
    reported on stdout and skipped so that one bad page does not abort the
    whole run.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    index_soup = fetch_soup(INDEX_URL)
    # Only anchors that actually carry an href can be followed.
    chapter_links = index_soup.select('.book-mulu ul li a[href]')

    # The context manager closes the file even if a later step raises.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as fp:
        for link in chapter_links:
            title = link.text
            chapter_url = build_chapter_url(link['href'])
            try:
                chapter_soup = fetch_soup(chapter_url)
            except requests.RequestException as exc:
                print('下载章节{0}失败: {1}'.format(title, exc))
                continue

            content_div = chapter_soup.find('div', class_='chapter_content')
            if content_div is None:
                # Page layout differs from what is expected; skip the chapter
                # rather than crash with AttributeError on None.
                print('下载章节{0}失败: 未找到正文'.format(title))
                continue

            fp.write(title + ':' + strip_spaces(content_div.text) + '\n')
            print('下载章节{0}成功'.format(title))
    print("下载完成")


if __name__ == '__main__':
    main()

 

posted @ 2022-10-18 20:20  没有童话的鱼  阅读(231)  评论(0)  编辑  收藏  举报