爬虫_斗图啦(队列,多线程)
"""Multithreaded image crawler for doutula.com built on producer/consumer queues.

Producer threads fetch listing pages and push ``(image_url, file_name)``
tuples onto a shared queue; consumer threads pop those tuples and download
the images into ``IMAGE_DIR``.
"""
import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

# Directory the downloaded images are written to (relative to the CWD).
IMAGE_DIR = 'images'

# Seconds to wait on a listing-page request before giving up on that page.
REQUEST_TIMEOUT = 10

# Marker placed on the image queue by main() once every producer has
# finished; a consumer that receives it stops.
_STOP = object()

# Characters removed from an image's alt text before it is used as a file
# name: the punctuation the original pattern targeted (ASCII and full-width
# forms) plus the characters Windows forbids in file names.
_UNSAFE_CHARS = re.compile(r'[,，。?？/\\·:*"<>|]')


class Producer(threading.Thread):
    """Worker that turns listing-page URLs into image download jobs.

    Args:
        page_queue: Queue of listing-page URLs to fetch.
        img_queue: Queue that receives ``(image_url, file_name)`` tuples.
        *args, **kwargs: Forwarded to ``threading.Thread``.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse pages until the page queue is drained."""
        while True:
            # get_nowait() is atomic; a separate empty() check followed by a
            # blocking get() could hang if another producer took the last
            # URL in between.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                self.parse_page(url)
            except (requests.RequestException, etree.LxmlError) as exc:
                # One bad page must not kill the worker thread.
                print('%s failed: %s' % (url, exc))

    def parse_page(self, url):
        """Fetch one listing page and enqueue every non-GIF image on it.

        Args:
            url: Listing-page URL.

        Raises:
            requests.RequestException: On network errors, timeouts or an
                HTTP error status.
        """
        response = requests.get(url, headers=self.headers,
                                timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        html = etree.HTML(response.text)
        if html is None:
            # Nothing parseable came back (e.g. an empty body).
            return
        imgs = html.xpath("//div[@class='page-content text-center']//a//img")
        for img in imgs:
            if img.get('class') == 'gif':
                continue
            img_url = img.get('data-original')
            if not img_url:
                # No lazy-load URL on this tag; nothing to download.
                continue
            suffix = os.path.splitext(img_url)[1]
            alt = _UNSAFE_CHARS.sub('', img.get('alt') or '')
            self.img_queue.put((img_url, alt + suffix))


class Consumer(threading.Thread):
    """Worker that downloads queued images into ``IMAGE_DIR``.

    Args:
        page_queue: Queue of listing-page URLs; stored for interface
            compatibility, not consulted for shutdown.
        img_queue: Queue of ``(image_url, file_name)`` tuples, terminated by
            one ``_STOP`` marker per consumer.
        *args, **kwargs: Forwarded to ``threading.Thread``.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download images until the stop marker arrives."""
        while True:
            # Block until work or the stop marker arrives. Checking whether
            # both queues are empty is not a reliable "done" signal:
            # producers may still be parsing pages they already dequeued.
            item = self.img_queue.get()
            if item is _STOP:
                return
            url, filename = item
            try:
                request.urlretrieve(url, os.path.join(IMAGE_DIR, filename))
            except (OSError, ValueError) as exc:
                # URLError/HTTPError are OSError subclasses; ValueError
                # covers malformed URLs. Skip the image, keep the worker.
                print('%s failed: %s' % (url, exc))
                continue
            print(filename+' 下载')


def main():
    """Crawl listing pages 1-100 with five producers and five consumers."""
    page_queue = Queue(100)
    img_queue = Queue(500)

    # urlretrieve does not create missing directories.
    os.makedirs(IMAGE_DIR, exist_ok=True)

    for x in range(1, 101):
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        page_queue.put(url)

    producers = [Producer(page_queue, img_queue) for _ in range(5)]
    consumers = [Consumer(page_queue, img_queue) for _ in range(5)]
    for t in producers + consumers:
        t.start()

    # Only after every producer has finished is the image queue complete;
    # then hand each consumer exactly one stop marker.
    for t in producers:
        t.join()
    for _ in consumers:
        img_queue.put(_STOP)
    for t in consumers:
        t.join()


if __name__ == '__main__':
    main()
下载是相当快啊