Scraping novels from txt80
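
This script scrapes www.txt80.com: it walks a letter-index page, opens each novel's detail page, follows the download page to the direct .txt link, and saves the file under E:\xiaoshuo\<letter>\.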

import os
import random
import re
import time

import lxml.etree
import requests
import faker

fake = faker.Faker()

uaList = []
for i in range(0, 10):
    uaList.append(fake.user_agent())

headers = {
    "User-Agent": random.choice(uaList)
}


def request_view(response):
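    # Debug helper: inject a <base> tag so relative links resolve, dump the response
    # body to tmp.html and open it in the default browser.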
    import webbrowser
    request_url = response.url
    base_url = '<head><base href="%s">' % (request_url)
    base_url = base_url.encode()
    content = response.content.replace(b"<head>", base_url)
    with open('tmp.html', 'wb') as tem_html:
        tem_html.write(content)
    webbrowser.open_new_tab('tmp.html')


class Crawl:
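    # Crawler for txt80.com; the host and the request headers are supplied by the caller.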
    host = ""
    headers = {}

    def __init__(self, host, headers):
        self.host = host
        self.headers = headers

    def get_content(self, url):
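        # Fetch a URL with the shared headers; return the raw bytes, or None on a non-200 response.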
        requests.adapters.DEFAULT_RETRIES = 5  # raise the default retry count
        s = requests.session()
        s.keep_alive = True  # keep the connection alive and reuse it between requests

        resp = s.get(url, headers=self.headers)
        if resp.status_code != 200:
            print("crawl url error " + url + str(resp.status_code))
            content = None
        else:
            content = resp.content
        return content

    def get_novel_list(self, content, code):
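        # Parse a letter-index listing page and visit the detail page of every novel on it.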
        html = lxml.etree.HTML(content)
        items = html.xpath('//div[@class="searchlist_l_box"]/ul//li')
        if len(items) > 0:
            for li in items:
                hrefs = li.xpath("./a/@href")
                if len(hrefs) > 0:
                    for href in hrefs:
                        detail_url = self.join_url(href)
                        self.get_download_url(detail_url, code)

    def join_url(self, url):
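        # Turn a site-relative href into an absolute URL on the configured host.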
        return "http://" + self.host + url

    def get_download_url(self, detail_url, code):
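        # On a novel's detail page, read the title and the link to the download page, then follow it.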
        content = self.get_content(detail_url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        title = html.xpath('//dd[@class="bt"]/h2/text()')
        download_url = html.xpath('//div[@class="downlinks"]//a/@href')

        if len(title) == 1 and len(download_url) >= 1:
            title = title[0]
            download_url = download_url[0]
            download_url = self.join_url(download_url)
            self.download_url(download_url, title, code)

    def download_url(self, url, title, code):
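        # Strip characters that are not allowed in Windows file names, then locate the
        # direct .txt link on the download page.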
        title = re.sub(r'[?\\*|"“<>:/]', '', title)
        content = self.get_content(url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        txt_url = html.xpath('//div[@class="downlist"][1]/li/strong/a/@href')
        if len(txt_url) == 1:
            self.download_txt(txt_url[0], title, code)

    def download_txt(self, url, title, code):
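        # Save the .txt under E:\xiaoshuo\<code>\, skipping files that already exist.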

        path = "E:\\xiaoshuo\\" + code
        isExists = os.path.exists(path)
        if not isExists:
            os.makedirs(path)
        file = path + "\\" + title + ".txt"
        if not os.path.exists(file):
            content = self.get_content(url)
            if content is not None:
                with open(file, "wb") as f:
                    f.write(content)
                print("download success " + title)
        else:
            print(file + " exists")

    def start(self):
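        # Crawl a single letter-index page; only "K" is enabled here, the commented-out
        # list below covers the other letters.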
        # list_code = [  'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        #              'U', 'V', 'W', 'X', 'Y', 'Z']
        #
        # for code in list_code:
        code = "K"
        url = "http://" + self.host + "/" + code + ".html"
        content = self.get_content(url)
        if content is not None:
            print("crawl url success:" + url)
            self.get_novel_list(content, code)


if __name__ == "__main__":
    host = "www.txt80.com"
    crawl = Crawl(host, headers)
    try:
        crawl.start()
    except Exception as e:
        print(str(e))
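
For completeness, here is a minimal sketch of crawling every letter index instead of the hard-coded "K", reusing the existing Crawl methods. The helper name crawl_all_codes and the random pause between index pages are assumptions added for illustration; they are not part of the original script.

# Hypothetical helper: iterate over every letter index using the existing Crawl methods.
def crawl_all_codes(host, headers):
    crawl = Crawl(host, headers)
    for code in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        url = "http://" + host + "/" + code + ".html"
        content = crawl.get_content(url)
        if content is not None:
            print("crawl url success: " + url)
            crawl.get_novel_list(content, code)
        time.sleep(random.uniform(1, 3))  # polite delay between index pages (assumed)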

 
