爬取动物图片源码

爬取动物图片源码

import os
import shutil
import time
from datetime import datetime
from threading import Lock, Thread

import requests
from bs4 import BeautifulSoup

def fun_makedir():
    """
    Create an empty, timestamped download folder and make it the working directory.

    The folder is ``<current working directory>/down/<YYYYmmddHHMMSS>``, where
    the timestamp is the local time at the moment of the call. If something
    already exists at that path it is removed first, so the folder always
    starts out empty. The process working directory is then changed to it.
    """
    stamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
    target_dir = os.getcwd() + '/down/' + stamp

    # Wipe a leftover folder with the same timestamp before recreating it.
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)

    os.chdir(target_dir)


def getmsg(url, timeout=10):
    """
    Collect the name and detail-page link of every picture on a listing page.

    :param url: URL of the listing page to fetch.
    :param timeout: Seconds to wait for the server before ``requests`` gives
        up; without a timeout a stalled connection would block forever.
    :return: A list of ``[picture_name, picture_page_url]`` pairs. The list is
        empty when the page has no ``<div id="container">`` element. Entries
        whose anchor is missing, or lacks an ``alt`` or ``href`` attribute,
        are skipped.
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    container = soup.find('div', id='container')
    if container is None:
        # Unexpected page layout (e.g. an error page): nothing to collect.
        return []

    pictrues = []
    # The class filter matters: per the original author's note, searching for
    # divs without it returned 80 elements instead of the expected 40.
    for box in container.find_all('div', class_='box picblock col3'):
        link = box.find('a')
        if link is None:
            continue
        pictrue_name = link.get('alt')
        pictrue_url = link.get('href')
        if pictrue_name is None or pictrue_url is None:
            continue
        # urljoin leaves absolute links untouched and resolves relative or
        # protocol-relative ones against the listing page URL.
        pictrues.append([pictrue_name, urljoin(url, pictrue_url)])
    return pictrues


# Guards the shared ``count`` global: save_pic is reached from many download
# threads at once, and an unguarded read-modify-write can hand two threads the
# same number (and therefore the same output filename).
_COUNT_LOCK = Lock()


def save_pic(pic_name, pic_url):
    """
    Download one picture and write it to the current working directory.

    The file is named ``"<n> <pic_name>.jpg"``, where ``n`` is the value of the
    module-level ``count`` after it has been incremented for this picture.
    ``count`` is only incremented once the HTTP request has succeeded, so it
    reflects the number of pictures actually fetched.

    :param pic_name: Human-readable picture name used in the filename.
    :param pic_url: Direct URL of the image file.
    :return: None
    :raises requests.RequestException: on connection errors, timeouts, or a
        non-2xx HTTP status.
    """
    global count

    pic = requests.get(pic_url, timeout=30)
    # Do not save an HTML error page under a .jpg name.
    pic.raise_for_status()

    # Reserve a unique sequence number atomically.
    with _COUNT_LOCK:
        count = count + 1
        index = count

    filename = str(index) + ' ' + pic_name + '.jpg'
    # 'wb' (not 'ab') so an existing file of the same name is replaced rather
    # than having a second image appended to it. The raw bytes live in
    # ``pic.content``; the Response object itself cannot be written.
    with open(filename, 'wb') as f:
        f.write(pic.content)
    print("图片:{}下载成功".format(filename))

def down_pictrue(pictrue_name, pictrue_url):
    """
    Resolve a picture's download link from its detail page and save the image.

    The detail page is fetched and the first ``<a>`` inside
    ``<div class="imga">`` is taken as the image link, which is then passed to
    ``save_pic``. Any failure along the way (network error, unexpected page
    layout, download or write error) is reported on stdout and swallowed, so
    one bad picture does not stop the others.

    :param pictrue_name: Picture name, used for the saved file and in messages.
    :param pictrue_url: URL of the picture's detail page.
    :return: None
    """
    from urllib.parse import urljoin

    try:
        down_res = requests.get(pictrue_url, timeout=10)
        down_res.encoding = 'utf-8'
        down_soup = BeautifulSoup(down_res.text, 'html.parser')
        down_link = down_soup.find('div', class_='imga').find('a')['href']
        # Absolute links pass through unchanged; relative ones are resolved
        # against the detail page URL.
        save_pic(pictrue_name, urljoin(pictrue_url, down_link))
    except Exception as exc:
        # Catch Exception rather than using a bare except, so SystemExit and
        # KeyboardInterrupt still propagate. Include the cause, because this
        # branch is also reached when the download itself fails.
        print("{}未获取到链接: {!r}".format(pictrue_name, exc))

# Main routine
def main():
    """
    Crawl listing pages 1-10 and download every picture found on them.

    For each listing page the picture entries are collected with ``getmsg``,
    then one thread per entry runs ``down_pictrue``; all threads of a page are
    joined before the next page is requested. A listing page that cannot be
    fetched is reported and skipped. At the end the value of the module-level
    ``count`` and the elapsed wall-clock time are printed.
    """
    global count  # picture counter, incremented by save_pic
    count = 0
    start_time = datetime.now()

    first_page = "http://sc.chinaz.com/tupian/dongwutupian.html"
    for i in range(1, 11):
        # Page 1 has no numeric suffix; later pages are dongwutupian_<i>.html.
        if i == 1:
            url = first_page
        else:
            url = "http://sc.chinaz.com/tupian/dongwutupian_{}.html".format(i)
        print("collecting message from {}".format(url))

        try:
            pictrues = getmsg(url)
        except requests.RequestException as exc:
            # One unreachable listing page should not abort the whole crawl.
            print("failed to fetch {}: {!r}".format(url, exc))
            continue

        # One worker thread per picture; the work is network-bound.
        threads = [
            Thread(target=down_pictrue, args=(name, page_url))
            for name, page_url in pictrues
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

    run_time = (datetime.now() - start_time).total_seconds()
    # The stray ``end='\t'`` previously passed to str.format() was ignored
    # there; dropping it leaves the printed text unchanged.
    print("\n一共下载{}张图片,共用时{}秒".format(count, run_time))

# Script entry point
if __name__ == "__main__":
    # Create the output folder and switch into it, then run the crawl.
    fun_makedir()
    main()

posted @ 2020-04-27 17:22  yxmichael  阅读(272)  评论(0)  编辑  收藏  举报