Python XPath抓取小说《三国演义》 《三》 多线程简单实例
增加多线程抓取数据,增加url判断,若数据已抓取,不再重复抓取 (可参考URL管理器)
需要再添加上队列(或线程池)来限制并发数,否则会对每个章节同时开启一个线程
"""XPath scraper for the novel "Romance of the Three Kingdoms" — one thread per chapter.

Fetches the chapter index from kanunu8.com, then downloads each chapter
concurrently, skipping chapters whose .txt file already exists on disk.
"""
from lxml import etree
import requests
import time
import os
import random
import urllib3
from multiprocessing import Pool
import _thread
import threading


def getHeaders():
    """Return a request-header dict with a randomly chosen User-Agent."""
    user_agents = ['Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
                   'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                   'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
                   'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
                   'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
                   'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
                   'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36'
                   ]
    headers = {'User-Agent': random.choice(user_agents), 'Connection': 'close'}
    return headers


def getRequestHtml(target):
    """Fetch *target* and return its HTML text, or None on connection failure.

    The site serves gb2312-encoded pages, so the response encoding is forced
    before decoding.
    """
    # Configure retries and silence the verify=False warning BEFORE issuing
    # the request (the original set these after requests.get had returned,
    # where they had no effect on that request).
    requests.adapters.DEFAULT_RETRIES = 5
    urllib3.disable_warnings()
    try:
        req = requests.get(url=target, headers=getHeaders(), verify=False, proxies=None)
        req.encoding = "gb2312"  # override requests' guess; pages are gb2312
        return req.text
    except requests.exceptions.ConnectionError:
        # BUG FIX: the original did `req.status_code = "Connection refused"`,
        # but `req` was still the empty string here, so that line itself
        # raised AttributeError. Signal failure explicitly instead; callers
        # check for None.
        return None


def getContents(target, filePath):
    """Return the list of chapter <a> elements from the index page at *target*.

    *filePath* is accepted for interface compatibility but unused here.
    Returns an empty list when the page could not be fetched.
    """
    html = getRequestHtml(target)
    if html is None:
        return []  # fetch failed; nothing to parse
    bookdata = etree.HTML(html)
    table_list = bookdata.xpath('//table[9]//tr[1]//td[2]//table[4]//tr[1]//td[1]//table[1]//a')
    return table_list


def getContent(filePath, title, target):
    """Fetch one chapter page at *target* and save its text as <filePath><title>.txt."""
    html = getRequestHtml(target)
    if html is None:
        return  # fetch failed; leave no partial file behind
    bookdata = etree.HTML(html)
    table_list = bookdata.xpath('//table[5]//tr[1]//td[2]//text()')
    saveData(filePath, title, table_list)


def saveData(filepath, name, text):
    """Write the iterable of strings *text* to <filepath><name>.txt (UTF-8).

    Creates the target directory if needed.
    """
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() + os.makedirs() pair.
    os.makedirs(filepath, exist_ok=True)
    url = filepath + name + ".txt"
    with open(url, mode="w", encoding="UTF-8") as f:
        f.writelines(text)
        f.write('\n\n')


class myThread(threading.Thread):
    """Worker thread that downloads and saves a single chapter."""

    def __init__(self, filePath, title, url):
        threading.Thread.__init__(self)
        self.filePath = filePath  # directory chapters are saved under
        self.title = title        # chapter title, used as the file name
        self.url = url            # absolute URL of the chapter page

    def run(self):
        getContent(self.filePath, self.title, self.url)


if __name__ == '__main__':
    # Index page of "Romance of the Three Kingdoms".
    target = "https://www.kanunu8.com/files/old/2011/2447.html"
    filePath = "D:\\小说\\三国演义\\"
    # Chapter title/href list from the index page.
    title_list = getContents(target, filePath)
    t_start = time.time()
    threadlist = []
    for t in title_list:
        title = t.text
        url = "https://www.kanunu8.com/files/old/2011/" + t.get('href')
        print(title, url)
        # Skip chapters that were already scraped on a previous run.
        isEx = os.path.isfile(filePath + title + ".txt")
        if not isEx:
            try:
                thread1 = myThread(filePath, title, url)
                # Daemon threads do NOT block interpreter exit (the original
                # comment claimed the opposite); we rely on the explicit
                # join() loop below to wait for completion.
                # (.daemon replaces the deprecated setDaemon().)
                thread1.daemon = True
                thread1.start()
                threadlist.append(thread1)
            except RuntimeError:  # was a bare except:, which hid real bugs
                print("无法启动线程")
        else:
            print("该文件已经存在 不需要再次抓取")
    # Wait for every chapter download to finish before reporting timing.
    for tt in threadlist:
        tt.join()
    t_end = time.time()
    print('抓取本书耗时= %s' % (t_end - t_start))