Novel Scraping
# -*- coding: utf-8 -*-
"""
# File        : 获取西游记数据.py
# Time        : 2022/4/24 17:38
# Author      : 希维
# Version     : python 3.8
# Description :
"""
import os

import requests
from bs4 import BeautifulSoup  # imported in the original script but not used
from fake_useragent import UserAgent
from lxml import etree


def getTitleLink(url):
    """Fetch the title and link of each chapter from the table-of-contents page."""
    headers = {
        'User-Agent': UserAgent().random
    }
    # fetch the page and decode it as GBK (the site's encoding), ignoring bad bytes
    resp = requests.get(url, headers=headers).content.decode('gbk', errors='ignore')
    e = etree.HTML(resp)
    chapters = e.xpath("/html/body/div[5]/div[2]/ul/li/a")
    print('chapters', chapters)
    titleLink = {}  # dictionary mapping each chapter title to its link
    for each in chapters:
        title = each.text
        link = 'https://www.gdwxcn.com/' + str(each.get('href'))  # build the full chapter URL
        titleLink[title] = link  # add the new title/link pair to the dictionary
    return titleLink  # chapter/link data collected from the contents page


def getText(url):
    """Fetch the body text of a single chapter page."""
    headers = {
        'User-Agent': UserAgent().random
    }
    # fetch the page and decode it as GBK (the site's encoding), ignoring bad bytes
    resp = requests.get(url, headers=headers).content.decode('gbk', errors='ignore')
    e = etree.HTML(resp)
    text = e.xpath('/html/body/div[5]/div/div[1]/p/text()')
    print('text:', text)
    return text


def mkdir(path):
    """Create the folder if it does not already exist."""
    folder = os.path.exists(path)
    if not folder:  # create the folder only when it does not exist yet
        os.makedirs(path)  # makedirs also creates any missing parent directories
        print("--- new folder... ---")
        print("--- OK ---")
    else:
        print("--- There is this folder! ---")


def main():
    """Download the novel chapter by chapter."""
    url = 'https://www.gdwxcn.com/gdxs/xyj/'
    titleLink = getTitleLink(url)
    out_dir = "D:\\Users\\Donal\\Documents\\pythonProject\\爬虫\\西游记小说爬取\\西游记"
    mkdir(out_dir)  # create the output folder
    i = 1
    for title, link in titleLink.items():
        # one numbered .txt file per chapter; the with-statement closes the file automatically
        with open(out_dir + '\\' + str(i) + str(title) + '.txt', 'w', encoding='utf-8') as f:
            for text in getText(link):
                f.write(text)
        i += 1


if __name__ == '__main__':
    main()
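The hard-coded XPath selectors are the part most likely to break if the site's layout changes, so it is worth sanity-checking them before downloading all chapters. The snippet below is only a sketch: the preview() helper and its n parameter are illustrative names (not part of the original script), and it simply reuses getTitleLink() and getText() from the code above, for example by calling preview() in place of main() in the same file.

def preview(url='https://www.gdwxcn.com/gdxs/xyj/', n=3):
    """Sketch: print the first n chapter titles and the length of the first chapter's text."""
    import time
    titleLink = getTitleLink(url)
    if not titleLink:
        print('no chapters found - check the contents-page XPath')
        return
    for title in list(titleLink)[:n]:
        print(title, '->', titleLink[title])
    time.sleep(1)  # small courtesy delay between requests (added here, not in the original)
    first_link = next(iter(titleLink.values()))
    print('first chapter length:', sum(len(t) for t in getText(first_link)), 'characters')

If the chapter list or the chapter text comes back empty, adjust the two XPath expressions to match the current page structure before running the full download.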