Crawler: producer and consumer
Structure
The producer generates URLs and puts them into a queue.
Multiple consumers take URLs off the queue.
from queue import Queue
import threading, requests

url_base = 'http://www.qiushibaike.com/8hr/page/{}/'
header = {}  # fill in request headers (e.g. a User-Agent) before actually downloading

def load_data():
    return [url_base.format(i) for i in [1, 3, 6, 7]]

# producer: put each URL on the queue
def produce(q):
    index = 0
    data = load_data()
    while index < len(data):      # stop once every URL has been queued
        q.put(data[index])        # blocks when the queue is full
        index += 1

# consumer: take URLs off the queue and (optionally) download them
def consume(q):
    while True:
        download_url = q.get()    # blocks until a URL is available
        # requests.get(download_url, headers=header)
        print('thread is {} content is {}'.format(threading.current_thread(), download_url))

def main():
    q = Queue(4)                  # bounded queue: at most 4 URLs waiting at a time
    p1 = threading.Thread(target=produce, args=[q])
    c1 = threading.Thread(target=consume, args=[q])
    c2 = threading.Thread(target=consume, args=[q])
    p1.start()
    c1.start()
    c2.start()

if __name__ == '__main__':
    main()
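As written, the two consumers block on q.get() forever once the queue is empty, so the program never exits on its own. Below is a minimal sketch of one common way to shut everything down cleanly, using a sentinel value together with Queue.task_done() and Queue.join(); the SENTINEL and NUM_CONSUMERS names are illustrative additions, not part of the original code.

from queue import Queue
import threading

SENTINEL = None                      # placed on the queue once per consumer to signal "no more work"
NUM_CONSUMERS = 2

def produce(q, urls):
    for url in urls:
        q.put(url)
    for _ in range(NUM_CONSUMERS):   # one sentinel per consumer
        q.put(SENTINEL)

def consume(q):
    while True:
        url = q.get()
        if url is SENTINEL:          # shutdown signal: stop this consumer
            q.task_done()
            break
        print('thread is {} content is {}'.format(threading.current_thread(), url))
        q.task_done()

def main():
    urls = ['http://www.qiushibaike.com/8hr/page/{}/'.format(i) for i in [1, 3, 6, 7]]
    q = Queue(4)
    threading.Thread(target=produce, args=[q, urls]).start()
    for _ in range(NUM_CONSUMERS):
        threading.Thread(target=consume, args=[q]).start()
    q.join()                         # returns once every queued item has been marked done

if __name__ == '__main__':
    main()

Putting one sentinel per consumer guarantees each consumer sees the shutdown signal exactly once, so all threads finish and the process exits.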
Class-based version
The crawler class must inherit from the threading class (threading.Thread).
Its __init__ method must call the parent class's __init__ (via super().__init__()).
Create an instance and call start(); this automatically invokes the run() method of the class.
# class ConsumeSpider(threading.Thread):
#     def __init__(self):
#         super().__init__()
#         pass
#
#     def run(self):
#         pass
#
# c3 = ConsumeSpider()
# c3.start()
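To make the skeleton concrete, here is a minimal sketch of a class-based consumer working against a queue. The ConsumeSpider name comes from the skeleton above, but the q constructor parameter, the daemon flag, and the body of run() are assumptions added for illustration.

from queue import Queue
import threading

class ConsumeSpider(threading.Thread):
    def __init__(self, q):
        super().__init__()        # must call the parent __init__ so the thread is set up
        self.q = q                # queue of URLs to consume (illustrative addition)

    def run(self):                # invoked automatically when start() is called
        while True:
            url = self.q.get()
            print('thread is {} content is {}'.format(self.name, url))
            self.q.task_done()

q = Queue(4)
c3 = ConsumeSpider(q)
c3.daemon = True                  # let the program exit even though run() loops forever
c3.start()
q.put('http://www.qiushibaike.com/8hr/page/1/')
q.join()                          # wait until the queued URL has been processed

Because run() loops forever, marking the thread as a daemon lets the main program exit once q.join() returns.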
Coroutines
A coroutine is a lightweight alternative to a thread: tasks switch cooperatively in user space, with no operating-system thread context switch, giving a way to multitask across several jobs. In Python this can be implemented with yield.
import time, threading

# each task is a generator: it runs until the yield, then hands control back
def task_1():
    while True:
        print('-----1-----', threading.current_thread())
        time.sleep(1)
        yield


def task_2():
    while True:
        print('-----2-----', threading.current_thread())
        time.sleep(1)
        yield


def main():
    t1 = task_1()
    t2 = task_2()
    # a simple hand-written scheduler: both tasks run in the same thread,
    # and next() resumes each one in turn
    while True:
        next(t1)
        next(t2)


if __name__ == '__main__':
    main()
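The yield version above schedules the two tasks by hand with next(). As a point of comparison (an addition, not part of the original post), the same cooperative round-robin can be written with Python's built-in asyncio coroutines, where the event loop does the switching and await asyncio.sleep() yields control instead of blocking the thread.

import asyncio

async def task_1():
    while True:
        print('-----1-----')
        await asyncio.sleep(1)   # hands control back to the event loop

async def task_2():
    while True:
        print('-----2-----')
        await asyncio.sleep(1)

async def main():
    # both coroutines run concurrently in a single thread
    await asyncio.gather(task_1(), task_2())

if __name__ == '__main__':
    asyncio.run(main())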