第10课-队列、使用多线程和队列的爬虫案例
1、队列代码示例
import threading
import time
from queue import Queue

# Queue is a thread-safe FIFO queue from the standard library.


def set_data(q, limit=None, delay=3):
    """Producer: put increasing integers (0, 1, 2, ...) into *q*.

    :param q: the shared Queue to fill.
    :param limit: stop after this many items; None (the default, matching
        the original demo) means produce forever.
    :param delay: seconds to sleep between puts (default 3, as originally).
    """
    index = 0
    while limit is None or index < limit:
        q.put(index)
        index += 1
        if delay:
            time.sleep(delay)


def get_data(q):
    """Consumer: block on *q* forever, printing each item as it arrives."""
    while True:
        print(q.get())


if __name__ == '__main__':
    q = Queue(4)  # bounded: put() blocks once 4 items are waiting
    t1 = threading.Thread(target=set_data, args=[q])
    t2 = threading.Thread(target=get_data, args=[q])
    t1.start()
    t2.start()

    # Small API demo on a fresh one-slot queue.
    q = Queue(1)
    q.put(1)
    q.get(timeout=1)
    print(q.empty())
    # BUG FIX: Queue.full() takes no arguments; the original
    # q.full(timeout=1) raised TypeError at runtime.
    print(q.full())
    print(q.qsize())
2、斗图爬虫实战
import requests
import threading
from queue import Queue, Empty
from lxml import etree
from urllib import request

# True while the producer thread is still crawling list pages; the
# consumer uses it to decide whether more links may still arrive.
g_flag = True

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}


# Crawl image links (producer thread)
def put_picture_link(q):
    """Fetch list pages 1-10 from doutula and push each non-gif meme
    image URL into *q*; clear g_flag when done."""
    global g_flag
    for i in range(1, 11):
        text = requests.get(url="http://www.doutula.com/article/list/?page={}".format(i),
                            headers=HEADERS).text
        html = etree.HTML(text)
        imgs_elements = html.xpath(
            '//div[@class="col-sm-9 center-wrap"]/a/div[@class="random_article"]/div/img [@class!="gif"]')
        for img_element in imgs_elements:
            image_link = img_element.xpath("@data-original")[0]
            q.put(image_link)
    g_flag = False  # tell the consumer no more links are coming


# Download images (consumer thread)
def download(q):
    """Pop image URLs from *q* and save each image to c://pictures,
    numbering files sequentially and keeping the original extension."""
    index = 1
    while g_flag or q.qsize() > 0:
        try:
            img_link = q.get(timeout=1)
        except Empty:
            # BUG FIX: the original let queue.Empty propagate and kill this
            # thread whenever the producer was momentarily slower than the
            # consumer; re-check the loop condition instead of crashing.
            continue
        result = requests.get(url=img_link)
        if result.status_code == 200:
            my_picture = result.content
            append = img_link.split(".")[-1]  # file extension, e.g. jpg/png
            with open("c://pictures/{}.{}".format(index, append), "wb") as fp:
                fp.write(my_picture)
            # Alternative one-line download:
            # request.urlretrieve(url=img_link, filename="c://pictures/{}.{}".format(index, append))
            index += 1


if __name__ == '__main__':
    q = Queue(10)  # bounded queue shared by producer and consumer
    t1 = threading.Thread(target=put_picture_link, args=[q])
    t1.start()
    t2 = threading.Thread(target=download, args=[q])
    t2.start()
    print("主线程执行完毕!!!")
3、百思不得姐爬虫实战
"""百思不得姐爬虫实战""" import threading from lxml import etree import requests from queue import Queue import csv g_Lock = threading.Lock() g_flag = True DOMAIN = "http://www.budejie.com/" HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36" } #生产者 class Producer(threading.Thread): def __init__(self,queue_url,queue_content): super(Producer,self).__init__() self.__queue_url = queue_url self.__queue_content = queue_content def run(self): global g_flag # count = 1 while self.__queue_url.qsize()>0: url = self.__queue_url.get() text = requests.get(url= url,headers = HEADERS).text html = etree.HTML(text) contents = html.xpath('//div[@class="g-mn"]//div[@class="j-r-list"]//ul//div[@class="j-r-list-c-desc"]/a') for c in contents: content = c.xpath("text()")[0].replace(r"\u200b","") link = DOMAIN + c.xpath("@href")[0] content_dict = {} content_dict["段子"] = content content_dict["链接"] = link self.__queue_content.put(content_dict) # print("第{}个页面请求成功".format(count)) # count += 1 g_flag = False print("-----------------------所有请求已完成---------------") #消费者 class Consumer(threading.Thread): def __init__(self,queue_content,writer,i): super(Consumer,self).__init__() self.__queue_content = queue_content self.__writer = writer self.__i = i def run(self): print("----dddddddddddddd---") while True: if self.__queue_content.qsize()>0 or g_flag : try: content_dict = self.__queue_content.get(timeout=1) g_Lock.acquire() self.__writer.writerow(content_dict) g_Lock.release() except Exception as e: print("队列为空{}".format(e)) else: break print("线程{}".format(self.__i),g_flag, self.__queue_content.qsize()) if __name__ == '__main__': q_url = Queue(100) q_content = Queue(100) for i in range(1,25): q_url.put("http://www.budejie.com/text/{}".format(i)) header = ["段子","链接"] fp = open("text.csv","w",encoding="utf-8",newline="") writer = csv.DictWriter(fp,header) writer.writeheader() for i in range(0,1): c = 
Consumer(q_content,writer,i) c.start() p = Producer(q_url,q_content) p.start()