Asynchronous crawling with a thread pool (multiprocessing.dummy)

Use the Pool class from multiprocessing.dummy; it exposes the same interface as multiprocessing.Pool but is backed by threads, which suits I/O-bound work such as HTTP requests.

# First build the list of URLs to crawl
import requests

url = 'https://www.qiushibaike.com/text/page/%d/'
urls = []
for page in range(1, 11):
    new_url = url % page
    urls.append(new_url)

# Perform the crawl
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
}


# Custom request function; it is called once for every URL that gets visited
# Note: it must take exactly one parameter
def get_request(url):
    return requests.get(url, headers=headers).text


from multiprocessing.dummy import Pool

pool = Pool(10)  # a pool of 10 worker threads
response_text_list = pool.map(get_request, urls)  # apply get_request to every element of urls asynchronously
print(response_text_list)
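
Because pool.map hands each worker function exactly one argument, extra parameters such as headers or a timeout have to be bound in beforehand. Below is a minimal sketch of one way to do that with functools.partial; the fetch helper and its parameters are illustrative, not part of the original code.

from functools import partial
from multiprocessing.dummy import Pool

import requests

urls = ['https://www.qiushibaike.com/text/page/%d/' % page for page in range(1, 11)]
headers = {'User-Agent': 'Mozilla/5.0'}

def fetch(url, headers=None, timeout=10):
    # still receives the URL as its single positional argument from pool.map
    return requests.get(url, headers=headers, timeout=timeout).text

pool = Pool(10)
# partial() fixes headers and timeout, so map() only needs to supply the URL
texts = pool.map(partial(fetch, headers=headers, timeout=5), urls)
pool.close()
pool.join()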

Comparing synchronous execution with thread-pool asynchronous execution

Classes and modules that need to be imported

from multiprocessing.dummy import Pool
import requests
import time

Synchronous code

start = time.time()

urls = ['http://127.0.0.1:5000/bobo', 'http://127.0.0.1:5000/jay', 'http://127.0.0.1:5000/tom']
for url in urls:
    page_text = requests.get(url).text  # each request blocks until the previous one has finished
    print(page_text)
print('Total time:', time.time() - start)

Asynchronous code

start = time.time()
pool = Pool(3)  # one worker thread per URL
urls = ['http://127.0.0.1:5000/bobo', 'http://127.0.0.1:5000/jay', 'http://127.0.0.1:5000/tom']

def get_request(url):
    return requests.get(url).text

# the three requests are issued concurrently
response_list = pool.map(get_request, urls)
print(response_list)

# parsing step
def parse(page_text):
    print(len(page_text))

pool.map(parse, response_list)
print('Total time:', time.time() - start)

You can spin up a simple Flask server of your own to test this code, as sketched below.
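
A minimal sketch of such a test server, assuming Flask is installed; the three routes and the 2-second sleep per route are assumptions chosen only to make the timing difference visible.

import time

from flask import Flask

app = Flask(__name__)

# each route sleeps to simulate a slow response (2 seconds is an arbitrary choice)
@app.route('/bobo')
def bobo():
    time.sleep(2)
    return 'hello bobo'

@app.route('/jay')
def jay():
    time.sleep(2)
    return 'hello jay'

@app.route('/tom')
def tom():
    time.sleep(2)
    return 'hello tom'

if __name__ == '__main__':
    # threaded=True lets the development server handle the three requests concurrently
    app.run(host='127.0.0.1', port=5000, threaded=True)

With this server running, the synchronous loop should take roughly the sum of the three delays (about 6 seconds), while the thread-pool version should finish in roughly the time of a single delay (about 2 seconds).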
