Improving Crawler Efficiency with Thread Pools
1. Using Flask to simulate a blocking server, then crawling it with a thread pool
# Mock blocking server with Flask: pip install flask
from flask import Flask, render_template  # render_template is needed to return a template file
from time import sleep

app = Flask(__name__)

@app.route('/bobo')
def index_1():
    return 'hello world'

@app.route('/jay')
def index_2():
    sleep(2)  # simulate a slow, blocking response
    return render_template('test.html')  # requires a templates/test.html file

@app.route('/tom')
def index_3():
    sleep(2)
    return render_template('test.html')

if __name__ == '__main__':
    app.run(threaded=True)  # debug=True can also go here, or the parentheses can be left empty
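With the mock server running, each request to /jay or /tom blocks for about two seconds before returning. A quick sanity check (a minimal sketch, assuming the server is up on Flask's default port 5000):

import time
import requests

start = time.time()
resp = requests.get('http://127.0.0.1:5000/jay')  # the server sleeps ~2 seconds before responding
print(resp.status_code, len(resp.text))
print('single request took', time.time() - start)  # expect a bit over 2 seconds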
Thread pool code:
# Synchronous version (for comparison):
# import requests
# import time
#
# urls = [
#     'http://127.0.0.1:5000/jay',
#     'http://127.0.0.1:5000/tom',
#     'http://127.0.0.1:5000/jay',
#     'http://127.0.0.1:5000/tom',
# ]
# start = time.time()
# for url in urls:
#     page_text = requests.get(url).text
#     print(len(page_text))
#
# print('total time', time.time() - start)  # roughly 8 seconds: 4 requests x 2 seconds each

# Asynchronous version:
import requests
import time
from multiprocessing.dummy import Pool  # a thread pool, despite the module name

pool = Pool(4)  # one worker thread per URL

urls = [
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]

start = time.time()

def get_request(url):
    page_text = requests.get(url).text
    print(len(page_text))
    return page_text  # must return the page so the second map has something to parse

def parse(page_text):
    print('pretend to parse')

page_text_list = pool.map(get_request, urls)  # this map issues the requests concurrently
pool.map(parse, page_text_list)               # this map parses the responses

print('total time', time.time() - start)  # roughly 2 seconds: the 4 requests run in parallel
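For comparison, the standard library offers the same thread-pool pattern through concurrent.futures. The sketch below is an alternative to multiprocessing.dummy, not part of the original code; it fetches the same four URLs concurrently:

import time
import requests
from concurrent.futures import ThreadPoolExecutor

urls = [
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]

def get_request(url):
    return requests.get(url).text

start = time.time()
with ThreadPoolExecutor(max_workers=4) as executor:
    # executor.map yields results in the same order as urls
    for page_text in executor.map(get_request, urls):
        print(len(page_text))
print('total time', time.time() - start)  # still roughly 2 seconds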
We are down, but not beaten. Tested, but not defeated.