Python爬取三国演义章节标题和内容(bs4爬取,解决中文乱码)
"""Download the chapter titles and texts of Sanguo Yanyi into one text file.

The script reads the chapter index page, follows every chapter link found
under the ``.book-mulu`` list, extracts the ``chapter_content`` div of each
chapter page with BeautifulSoup, and appends ``title:content`` lines to
``./sanguoyanyi/sanguoyanyi.txt``.
"""
import os.path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root that the chapter hrefs from the index page are resolved against.
BASE_URL = 'https://www.shicimingju.com/'
# Index page that lists every chapter link.
INDEX_URL = 'https://www.shicimingju.com/book/sanguoyanyi.html'
OUTPUT_DIR = './sanguoyanyi'
OUTPUT_FILE = './sanguoyanyi/sanguoyanyi.txt'
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}


def _fetch_html(session, url):
    """Return the body of *url* decoded as UTF-8.

    Args:
        session: The ``requests.Session`` used to issue the GET request.
        url: Absolute URL of the page to download.

    Returns:
        The decoded page text.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response status.
    """
    response = session.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Set the encoding explicitly so the Chinese text is not garbled when the
    # response does not declare a charset. 'utf-8-sig' also drops a leading
    # BOM if one is present, and is otherwise identical to 'utf-8'.
    response.encoding = 'utf-8-sig'
    return response.text


def _extract_chapter_text(html):
    """Return the text of the ``chapter_content`` div found in *html*.

    Args:
        html: Decoded HTML of a chapter page.

    Returns:
        The text of the div with the padding character removed.

    Raises:
        ValueError: If the page has no ``div`` with class ``chapter_content``.
    """
    container = BeautifulSoup(html, 'lxml').find('div', class_='chapter_content')
    if container is None:
        raise ValueError('chapter page has no div.chapter_content element')
    # NOTE(review): the character stripped here is copied from the original
    # literal; if the intent was to strip non-breaking spaces (U+00A0) left by
    # &nbsp; entities, confirm that this literal is that character.
    return container.text.replace(' ', '')


def main():
    """Scrape every chapter and write them to the output file."""
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # One session reuses the underlying connection for all chapter requests.
    with requests.Session() as session:
        # Fetch the index page and collect the chapter links.
        index_soup = BeautifulSoup(_fetch_html(session, INDEX_URL), 'lxml')
        chapter_links = index_soup.select('.book-mulu ul li a')

        # The context manager closes the file even if a download fails midway.
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as fp:
            for link in chapter_links:
                # Chapter title.
                title = link.text
                # URL of the page holding this chapter's text; urljoin avoids
                # a doubled slash when the href is root-relative.
                chapter_url = urljoin(BASE_URL, link['href'])
                # Download the chapter page and pull out its text.
                content_data = _extract_chapter_text(
                    _fetch_html(session, chapter_url)
                )
                fp.write(title + ':' + content_data + '\n')
                print('下载章节{0}成功'.format(title))

    print("下载完成")


if __name__ == '__main__':
    main()