Multithreading

Scraping data is I/O-bound work, so multithreading can be used to improve efficiency.

When to use a thread pool: for any time-consuming (blocking) operation.

The concurrent.futures module provides a ThreadPoolExecutor class that can be used for this.
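
A minimal sketch of the idea; the fetch_status helper and the repeated URL list are only placeholders for illustration:

from concurrent.futures import ThreadPoolExecutor

import requests


def fetch_status(url):
    # each call blocks on network I/O, so running several calls in threads overlaps the waiting
    return requests.get(url).status_code


urls = ['http://sc.chinaz.com/tupian/index.html'] * 3  # placeholder URLs
with ThreadPoolExecutor(max_workers=5) as pool:
    # map submits one task per URL and yields the results in input order
    for status in pool.map(fetch_status, urls):
        print(status)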

Scraping HD images from 站长素材 (sc.chinaz.com)

#! /usr/bin/env python
# -*- coding: utf-8 -*-

from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import os
import random
import requests

img_url_XP = '//div[@id="container"]/div/p/a/@href'
img_name_XP = '//div[@id="container"]/div/p/a/@alt'
img_download_url_XP = '//div[@class="downbody"]//a/@href'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    'Connection': 'close',
}


def get_page(n: int):
    if n <= 1:
        return 'http://sc.chinaz.com/tupian/index.html'
    else:
        return f'http://sc.chinaz.com/tupian/index_{n}.html'


# collect (download_url, filename) pairs from the listing pages
img_download_url_list = []
start_page = 1
end_page = 3
for i in range(start_page, end_page + 1):
    url = get_page(i)
    res = requests.get(url, headers=headers)
    res.encoding = 'utf8'
    tree = etree.HTML(res.text)
    img_url_list = tree.xpath(img_url_XP)
    img_name_list = tree.xpath(img_name_XP)
    img_name_list = [name + '.rar' for name in img_name_list]
    # open each image's detail page and pick one of its download links at random
    for key, img_url in enumerate(img_url_list):
        res2 = requests.get(img_url, headers=headers)
        tree2 = etree.HTML(res2.text)
        download_list = tree2.xpath(img_download_url_XP)
        img_download_url = random.choice(download_list)
        img_download_url_list.append([img_download_url, img_name_list[key]])

print(img_download_url_list)


def get_data(args):
    """

    :param args: (url,name)
    :return: (content,name)
    """
    res = requests.get(args[0], headers=headers)
    print(f'Fetched: {args[1]}')
    return res.content, args[1]


def save_data(args):
    """

    :param args: (content,name)
    :return:
    """
    with open(f'static/zhanzhang/{args[1]}', 'wb') as f:
        f.write(args[0])
    print(f'已保存数据:{args[1]}')


# When to use a thread pool: for any time-consuming (blocking) operation
thread_pool = ThreadPoolExecutor(max_workers=10)
map_res = thread_pool.map(get_data, img_download_url_list)  # submits all downloads; results are yielded lazily, in input order
list(thread_pool.map(save_data, map_res))  # consuming the iterator blocks until every file has been saved
thread_pool.shutdown()
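
Note that Executor.map returns a lazy iterator: the second map pulls results out of map_res as it submits the save tasks, so downloading and saving overlap naturally. A common alternative is to create the executor in a with block, as in the sketch near the top, so the pool is shut down automatically once all submitted tasks have finished.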
