爬虫习作-爬小说

# coding=utf8
import traceback
from bs4 import BeautifulSoup
import requests
import re
import time

#get novel information!  https://www.bxwxorg.com/

def getHTMLText(url, timeout=10):
    """Download *url* and return its source as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded page source as ``str``, or ``""`` when the request
        fails or the server answers with an error status (4xx/5xx).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers treat an empty string as "no page".
        return ""
    # Decode using the encoding detected from the body content.
    r.encoding = r.apparent_encoding
    return r.text

def getList(lst, URL):
    """Collect chapter ids from the links on the index page at *URL*.

    For every ``<a>`` tag on the page, the second run of digits found in
    its ``href`` is taken as a chapter id. Ids not yet present are appended
    to *lst*, which is then sorted in ascending numeric order.

    Args:
        lst: List of chapter-id strings to extend; modified in place.
            Entries are expected to be digit strings (sorting uses ``int``).
        URL: Address of the novel's index page.

    Returns:
        The same *lst* object, for convenience.
    """
    html = getHTMLText(URL)
    soup = BeautifulSoup(html, "html.parser")
    seen = set(lst)  # O(1) duplicate check instead of scanning the list
    for link in soup.find_all('a'):
        try:
            href = link.attrs['href']
            # Second digit group in the href is used as the chapter id.
            chapter_id = re.findall(r"\d{1,8}", href)[1]
        except (KeyError, IndexError, TypeError):
            # No href, or fewer than two digit groups: not a chapter link.
            continue
        if chapter_id not in seen:
            seen.add(chapter_id)
            lst.append(chapter_id)
    # Sort numerically: a plain string sort would put "100" before "99".
    lst.sort(key=int)
    print("小说共%s章节:"%len(lst))
    return lst

def getInfo(lst, articlURL, path):
    """Download each chapter and append its text to a per-novel file.

    For every id in *lst* the page ``articlURL + id + ".html"`` is fetched.
    The novel title is read from the ``article-name`` attribute of
    ``<body>``, the chapter name from the first ``<h1>``, and the chapter
    text from every ``<div id="content">``. Output is appended to
    ``path + title + ".txt"`` encoded as UTF-8.

    Args:
        lst: Iterable of chapter-id strings.
        articlURL: Base URL that chapter ids are appended to.
        path: Output directory prefix; it is concatenated directly with
            the file name, so it must end with a path separator.

    Returns:
        None. Progress, errors and the elapsed time are printed.
    """
    timeStart = time.time()  # time the whole crawl
    for articlNum in lst:
        url = articlURL + articlNum + ".html"
        print(url)
        html = getHTMLText(url)
        if html == "":
            # Fetch failed (e.g. 404): report which page and skip it
            # instead of parsing an empty document.
            print("%s信息不存在!" % url)
            continue
        try:
            soup = BeautifulSoup(html, "html.parser")
            title = soup.body['article-name']  # novel title
            chapter = soup.find('h1').text  # chapter name
            print("======开始爬取<%s>信息======" % chapter)
            fpath = path + title + ".txt"
            content = soup.find_all('div', id='content')
            with open(fpath, 'a', encoding='utf-8') as f:
                for i in content:
                    # NOTE(review): in text mode on Windows the '\n' inside
                    # '\r\n' may be translated again on write -- confirm the
                    # resulting line endings are what is wanted.
                    text = i.text.replace('\n', '\r\n')
                    f.write(chapter + text + "\n" * 2)
        except Exception:
            # Keep crawling the remaining chapters; show what went wrong.
            traceback.print_exc()
            print("======error======")
    print("耗时:%s 秒" %(time.time() - timeStart))

# Script entry point: crawl one novel's chapters into a text file.
# Guarded so that importing this module does not start a crawl.
if __name__ == "__main__":
    url = "https://www.bxwxorg.com/read/129669/"
    lst = getList([], url)
    # None fetches every chapter; set to e.g. 5 to fetch only the first five.
    max_chapters = None
    lst = lst[:max_chapters]
    path = "F:\\workspace\\API_test\\Crawlers\\Info\\"
    getInfo(lst, url, path)

 经过几天的努力,终于小有收获!happy~

posted @ 2021-01-12 15:35  不不田鼠  阅读(122)  评论(0)  编辑  收藏  举报