Crawler Practice: Scraping a Novel
# coding=utf8
import traceback
from bs4 import BeautifulSoup
import requests
import re
import time

# Get novel information from https://www.bxwxorg.com/
def getHTMLText(url):
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # decode with the encoding guessed from the page content
        return r.text  # on success, return the page source as a str
    except:
        return ""  # on any error, return an empty string

def getList(lst, URL):
    html = getHTMLText(URL)
    soup = BeautifulSoup(html, "html.parser")  # build a BeautifulSoup object
    a = soup.find_all('a')  # find all <a> links
    for i in a:
        try:
            href = i.attrs['href']  # the concrete link address
            if re.findall(r"\d{1,8}", href)[1] not in lst:
                lst.append(re.findall(r"\d{1,8}", href)[1])  # the second number in the href is the chapter ID
        except:
            continue
    lst.sort(key=int)  # sort chapter IDs numerically, not lexicographically
    print("The novel has %s chapters:" % len(lst))
    return lst

def getInfo(lst, articlURL, path):
    timeStart = time.time()  # time the crawl
    for articlNum in lst:
        url = articlURL + articlNum + ".html"
        print(url)
        html = getHTMLText(url)  # fetch the chapter detail page
        try:
            if html == "":  # empty source (404 or other non-200 response): skip extraction
                print("%s not found!" % url)
                continue
            soup = BeautifulSoup(html, "html.parser")
            title = soup.body['article-name']  # novel title (attribute on <body> used by this site)
            chapter = soup.find('h1').text  # chapter title
            print("====== crawling <%s> ======" % chapter)
            fpath = path + title + ".txt"
            content = soup.find_all('div', id='content')
            with open(fpath, 'a', encoding='utf-8') as f:
                for i in content:
                    text = i.text.replace('\n', '\r\n')  # normalise line breaks
                    f.write("【" + chapter + "】" + text + "\n" * 2)
        except:
            traceback.print_exc()
            print("====== error ======")
    print("Elapsed: %s seconds" % (time.time() - timeStart))

# main
lst = []
url = "https://www.bxwxorg.com/read/129669/"
getList(lst, url)
lst = lst[:len(lst)]  # keep all chapters
# lst = lst[:5]  # only the first 5 chapters
path = "F:\\workspace\\API_test\\Crawlers\\Info\\"
getInfo(lst, url, path)
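Before crawling the whole book, it is worth doing a small test run, which is what the commented-out lst = lst[:5] line above is for. A minimal sketch, assuming the functions, book URL, and output path defined in the script above:

# Quick smoke test: fetch only the first five chapters and check the output file.
lst = []
url = "https://www.bxwxorg.com/read/129669/"
path = "F:\\workspace\\API_test\\Crawlers\\Info\\"
getList(lst, url)            # populate lst with all chapter IDs
getInfo(lst[:5], url, path)  # crawl just the first 5 chapters to verify the result

If the first few chapters land in the .txt file as expected, rerun with the full list.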
After a few days of effort, I finally have something small to show for it. Happy~
All technology exists to serve the business; technology divorced from the business is worthless!