爬虫之多线程、异步
1.使用传统方式爬取“斗图啦”网站的图片
# -*- coding: utf-8 -*-
"""Crawl meme images from doutula.com page by page (single-threaded)."""
import os
import os.path
import re
from urllib import request

import requests
from lxml import etree

# Destination directory for downloaded images.  Raw string: the original
# 'E:\study' literal relied on the invalid escape sequence '\s'.
SAVE_DIR = r'E:\study'

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}


def parse_page(url):
    """Fetch one listing page and download every non-GIF meme on it.

    :param url: URL of a doutula photo-list page.
    """
    response = requests.get(url=url, headers=HEADERS)
    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.fromstring(response.text, parser=parser)
    images = html.xpath('//div[@class="page-content text-center"]//a//img[@class!="gif"]')
    for img in images:
        # Lazy-loaded images keep the real URL in the data-original attribute.
        img_url = img.get("data-original")
        if not img_url:  # placeholder without a usable URL — skip it
            continue
        img_name = img.get("alt") or ''
        # Strip characters that are illegal or awkward in Windows filenames.
        img_name = re.sub(r'[??!!./,,。]', '', img_name)
        img_postfix = os.path.splitext(img_url)[1]
        img_save_path = os.path.join(SAVE_DIR, img_name + img_postfix)
        request.urlretrieve(img_url, img_save_path)


def main():
    """Iterate over the first 100 listing pages and download their images."""
    os.makedirs(SAVE_DIR, exist_ok=True)  # ensure the target directory exists
    for page in range(1, 101):
        url = 'https://www.doutula.com/photo/list/?page=%d' % page
        parse_page(url)


if __name__ == '__main__':
    main()
2.使用生产者与消费者模式多线程下载表情包
# -*- coding: utf-8 -*-
"""Producer/consumer multithreaded meme downloader for doutula.com."""
import os
import os.path
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

# Destination directory for downloaded images.  Raw string: the original
# 'E:\study' literal relied on the invalid escape sequence '\s'.
SAVE_DIR = r'E:\study'


class Producer(threading.Thread):
    """Pulls listing-page URLs from page_queue, parses them, feeds image_queue."""

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue    # queue of listing-page URLs to crawl
        self.image_queue = image_queue  # queue of (img_url, save_path) tuples

    def run(self):
        # Drain the page queue; exit once no pages remain.
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        """Parse one listing page and enqueue every non-GIF image found."""
        response = requests.get(url=url, headers=self.headers)
        parser = etree.HTMLParser(encoding='utf-8')
        html = etree.fromstring(response.text, parser=parser)
        images = html.xpath('//div[@class="page-content text-center"]//a//img[@class!="gif"]')
        for img in images:
            # Lazy-loaded images keep the real URL in data-original.
            img_url = img.get("data-original")
            if not img_url:  # placeholder without a usable URL — skip it
                continue
            img_name = img.get("alt") or ''
            # Remove characters that are illegal/awkward in Windows filenames.
            img_name = re.sub(r'[,。?!*,\\.?/]', '', img_name)
            img_postfix = os.path.splitext(img_url)[1]
            img_save_path = os.path.join(SAVE_DIR, img_name + img_postfix)
            self.image_queue.put((img_url, img_save_path))


class Consumer(threading.Thread):
    """Takes (url, path) pairs from image_queue and downloads them."""

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        while True:
            try:
                # Bounded wait instead of the original empty()-then-get(block=True),
                # which could block forever when another consumer drained the
                # queue between the emptiness check and the get.
                url, path = self.image_queue.get(timeout=5)
            except Empty:
                if self.page_queue.empty():
                    return  # producers are done and nothing is left
                continue  # producers still working — try again
            request.urlretrieve(url, path)
            print(path + "下载完成!")


def main():
    """Queue 100 listing pages and run 5 producers plus 5 consumers."""
    os.makedirs(SAVE_DIR, exist_ok=True)  # ensure the target directory exists
    page_queue = Queue(100)
    img_queue = Queue(1000)
    for page in range(1, 101):
        url = 'https://www.doutula.com/photo/list/?page=%d' % page
        page_queue.put(url)
    for _ in range(5):
        Producer(page_queue, img_queue).start()
    for _ in range(5):
        Consumer(page_queue, img_queue).start()


if __name__ == '__main__':
    main()
使用生产者消费者模式下载内涵段子并保存在csv文件
# -*- coding: utf-8 -*-
"""Producer/consumer crawler: scrape jokes from budejie.com into a CSV file."""
import csv
import threading
from queue import Empty, Queue

import requests
from lxml import etree


class Producer(threading.Thread):
    """Pulls list-page URLs from page_queue and pushes (joke, link) tuples."""

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    def __init__(self, page_queue, joke_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # URLs of joke-list pages to fetch
        self.joke_queue = joke_queue  # (joke_text, detail_link) tuples

    def run(self):
        # Drain the page queue; exit once no pages remain.
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch one list page and extract each joke's text and detail link."""
        response = requests.get(url=url, headers=self.headers)
        parser = etree.HTMLParser(encoding='utf-8')
        html = etree.fromstring(response.text, parser=parser)
        descs = html.xpath("//div[@class='j-r-list-c-desc']//a")
        for a in descs:
            joke_content = "\n".join(a.xpath("text()")).strip()
            link = 'http://www.budejie.com' + a.xpath("@href")[0]
            self.joke_queue.put((joke_content, link))
        print('=' * 30 + "第%s页下载完成!" % url.split('/')[-1] + "=" * 30)


class Consumer(threading.Thread):
    """Pops jokes off joke_queue and appends them to a shared CSV file."""

    # BUG FIX: the original created a new Lock per Consumer instance, so the
    # lock never excluded other consumer threads from writing concurrently.
    # One class-level lock is shared by all instances and actually serializes
    # access to the CSV file.
    csv_lock = threading.Lock()

    def __init__(self, page_queue, joke_queue, csvfilepath, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.csvfilepath = csvfilepath  # destination CSV file (append mode)

    def run(self):
        while True:
            try:
                # Bounded wait instead of the original empty()-then-get(),
                # which could block forever once another consumer drained the
                # queue between the emptiness check and the get.
                joke, link = self.joke_queue.get(timeout=5)
            except Empty:
                if self.page_queue.empty():
                    return  # producers finished and the queue is drained
                continue  # producers still working — try again
            # 'with' releases the lock even if the write raises (the original
            # acquire()/release() pair would have leaked the lock on error).
            with self.csv_lock:
                self.writecsv((joke, link))

    def writecsv(self, row):
        """Append one (joke, link) row to the CSV file."""
        with open(self.csvfilepath, "a+", newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile, dialect='excel')
            writer.writerow(row)


def main():
    """Queue 10 joke pages and run 5 producers plus 5 consumers."""
    page_queue = Queue(10)
    joke_queue = Queue(1000)
    csvfilepath = r'E:\study\joke.csv'
    for page in range(1, 11):
        url = 'http://www.budejie.com/text/%d' % page
        page_queue.put(url)
    for _ in range(5):
        Producer(page_queue, joke_queue).start()
    for _ in range(5):
        Consumer(page_queue, joke_queue, csvfilepath).start()


if __name__ == '__main__':
    main()
>>>>>>>>>>待续