# 爬取动物图片源码 (source code for crawling animal pictures)
import os, shutil
import time
from datetime import datetime
from threading import Lock
from threading import Thread

import requests
from bs4 import BeautifulSoup
def fun_makedir():
    """Create a fresh, timestamp-named download folder and switch into it.

    The folder is ``<cwd>/down/<YYYYmmddHHMMSS>``. If a folder with that name
    already exists it is removed first, so the result is always empty. The
    process working directory is changed to the new folder.
    """
    stamp = time.strftime('%Y%m%d%H%M%S', time.localtime())
    target_dir = os.getcwd() + '/down/' + stamp
    # Start from a clean folder if one with the same timestamp is present.
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)
    os.chdir(target_dir)
def getmsg(url):
    """Collect the name and detail-page link of every picture on a listing page.

    :param url: URL of the listing page to fetch.
    :return: list of ``[picture_name, picture_page_url]`` pairs; empty when the
        page contains no ``div#container`` element.
    :raises requests.RequestException: if the page cannot be fetched
        (connection error or timeout).
    """
    pictrues = []
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', id='container')
    if container is None:
        # Unexpected page layout (or an error page): nothing to collect.
        return pictrues
    # The class filter is required: per the original author's note, searching
    # for bare <div> elements returned 80 results instead of the expected 40.
    for box in container.find_all('div', class_='box picblock col3'):
        link = box.find('a')
        if link is None:
            continue
        pictrue_name = link.get('alt')
        pictrue_url = link.get('href')
        # Skip boxes whose anchor lacks either attribute instead of crashing.
        if pictrue_name is None or pictrue_url is None:
            continue
        pictrues.append([pictrue_name, pictrue_url])
    return pictrues
# Serializes updates of the shared download counter across worker threads.
_COUNT_LOCK = Lock()


def save_pic(pic_name, pic_url):
    """Download one picture and save it in the current working directory.

    The file is named ``"<n> <pic_name>.jpg"`` where ``n`` is the new value of
    the module-level ``count``. ``count`` must be initialised before the first
    call (``main`` sets it to 0). It is incremented only after the HTTP
    request succeeded, so it reflects pictures actually fetched.

    :param pic_name: display name of the picture, used in the file name.
    :param pic_url: direct URL of the image file.
    :return: None
    :raises requests.RequestException: on connection error, timeout or a
        non-success HTTP status.
    """
    global count
    pic = requests.get(pic_url, timeout=30)
    # Do not store an HTTP error page under a .jpg name.
    pic.raise_for_status()
    # This function runs in many threads at once; an unguarded
    # read-modify-write of ``count`` can lose updates and hand the same
    # index to two files.
    with _COUNT_LOCK:
        count = count + 1
        index = count
    filename = str(index) + ' ' + pic_name + '.jpg'
    # 'wb' (not 'ab'): never append to a file that happens to exist already.
    with open(filename, 'wb') as f:
        # The raw bytes are in ``content``; the response object itself
        # cannot be written.
        f.write(pic.content)
    print("图片:{}下载成功".format(filename))
def down_pictrue(pictrue_name, pictrue_url):
    """Resolve the image link on a picture's detail page and download it.

    The detail page is fetched, the first ``<a href>`` inside
    ``div.imga`` is taken as the image URL, and ``save_pic`` is called with
    it. This is a best-effort step: any failure (page fetch, missing
    element, or the download inside ``save_pic``) is reported with a single
    message and does not propagate.

    :param pictrue_name: display name of the picture.
    :param pictrue_url: URL of the picture's detail page.
    :return: None
    """
    try:
        # A timeout keeps a stalled connection from blocking this worker forever.
        down_res = requests.get(pictrue_url, timeout=10)
        down_res.encoding = 'utf-8'
        down_soup = BeautifulSoup(down_res.text, 'html.parser')
        down_link = down_soup.find('div', class_='imga').find('a')['href']
        save_pic(pictrue_name, down_link)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that SystemExit and
        # KeyboardInterrupt are not swallowed.
        print("{}未获取到链接".format(pictrue_name))
# Main routine
def main():
    """Crawl listing pages 1-10 and download every picture found on them.

    For each listing page the picture entries are collected with ``getmsg``;
    one thread per picture runs ``down_pictrue``, and all threads of a page
    are joined before the next page is processed. At the end the total
    picture count and the elapsed time in seconds are printed.

    NOTE(review): the nesting of the per-page download batch inside the page
    loop is reconstructed from the statement order; confirm it matches the
    intended layout.
    """
    global count  # running picture total; incremented by save_pic
    start_time = datetime.now()
    count = 0
    first_page = "http://sc.chinaz.com/tupian/dongwutupian.html"
    page_template = "http://sc.chinaz.com/tupian/dongwutupian_{}.html"
    for i in range(1, 11):
        # Page 1 has no numeric suffix; later pages are "..._<n>.html".
        url = first_page if i == 1 else page_template.format(i)
        print("collecting message from {}".format(url))
        pictrues = getmsg(url)
        # One worker thread per picture; item is [name, detail_page_url].
        threads = [
            Thread(target=down_pictrue, args=(item[0], item[1]))
            for item in pictrues
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    run_time = (datetime.now() - start_time).total_seconds()
    # The original passed end='\t' to str.format(), where it was silently
    # ignored; it is dropped here so the printed output is unchanged.
    print("\n一共下载{}张图片,共用时{}秒".format(count, run_time))
# Script entry point
if __name__ == "__main__":
    # Create the output folder (and switch into it) so pictures are saved there.
    fun_makedir()
    # Run the crawl.
    main()