Python -- 多线程爬取顶点小说

from queue import Empty, Queue
from threading import Condition, Thread

import requests
from lxml import etree


class MyThread(Thread):
    """Worker thread that downloads chapters and saves them in order.

    Each worker takes ``(sequence, href)`` items from the shared queue,
    fetches and parses the chapter page, then waits until every earlier
    chapter has been handled before appending its own to the output file.

    Relies on module-level names set up by the calling script: ``root``
    (base URL), ``headers`` (HTTP request headers), ``f`` (open output file)
    and ``index`` (sequence number of the last chapter handled; -1 at start).
    """

    # Shared by all workers: serialises updates to ``index`` / writes to ``f``
    # and wakes the threads that are waiting for their turn to write.
    _order_cond = Condition()

    def __init__(self, q):
        """Store the shared queue of ``(sequence, href)`` work items."""
        Thread.__init__(self)
        self.q = q

    def run(self):
        """Drain the queue, writing each fetched chapter in sequence order."""
        global index
        while True:
            # get_nowait() rather than empty() followed by get(): another
            # worker may take the last item between those two calls, which
            # would leave this thread blocked in get() forever.
            try:
                seq, href = self.q.get_nowait()
            except Empty:
                return

            url = root + ''.join(href)
            try:
                response = requests.get(url, headers=headers)
            except requests.RequestException as exc:
                # Report and skip this chapter, but still take our turn below
                # so the workers holding later chapters are not stuck forever.
                print("爬取失败 -> %s" % url, exc)
                chapter = content = None
            else:
                page = etree.HTML(response.content)

                chapter = ''.join(page.xpath("//h1/text()"))
                print("爬取 -> %s" % chapter, index)

                content = '\n'.join(page.xpath("//div[@id='content']/text()"))
                content = content.replace("\xa0\xa0\xa0\xa0", "\t")

            with self._order_cond:
                # Sleep (instead of spinning) until the preceding chapter has
                # been handled, i.e. until this chapter is the next in line.
                self._order_cond.wait_for(lambda: seq <= index + 1)

                # Exactly the next chapter: save it and advance the marker.
                if seq == index + 1:
                    if content is not None:
                        print("保存 -> %s" % chapter, index)
                        f.write('\n' + chapter + '\n')
                        f.write(content)
                    index += 1
                    self._order_cond.notify_all()


if __name__ == '__main__':
    # Module-level settings; MyThread.run reads ``root`` and ``headers``.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    root = "http://www.booktxt.net/8_8455/"

    # Sequence number of the last chapter saved; -1 means nothing saved yet.
    index = -1

    # Download the index page and take the text of its <h1> as the novel title.
    toc_response = requests.get(root, headers=headers)
    toc_page = etree.HTML(toc_response.content)
    title = ''.join(toc_page.xpath("//h1/text()"))
    print(title)

    # ``f`` must keep this name: the worker threads write to it as a global.
    with open("{}.txt".format(title), 'w', encoding='utf8') as f:
        # The novel title goes first in the output file.
        f.write(title)

        # Chapter links: every <dd> that follows the second <dt> in the list.
        chapter_hrefs = toc_page.xpath("//div[@id='list']/dl/dt[2]/following-sibling::dd/a/@href")

        # Queue up (sequence number, href) pairs for the workers.
        tasks = Queue()
        for numbered_href in enumerate(chapter_hrefs):
            tasks.put(numbered_href)

        # Run five workers and wait for all of them before closing the file.
        workers = [MyThread(tasks) for _ in range(5)]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

  转载自--https://www.cnblogs.com/twoice/p/11405677.html

posted @ 2019-10-11 18:38  传道授业  阅读(735)  评论(0)  编辑  收藏  举报