# gevent-based concurrent image scraper for meizitu.net.cn listing pages
import gevent
from gevent import monkey #从gevnet库里导入monkey模块
monkey.patch_all() #能把程序变成协作式运行,就是可以帮助程序实现异步
from gevent.queue import Queue
from bs4 import BeautifulSoup
import os
import time,requests
# --- setup: output directory, request headers, listing-page URLs, work queue ---

# Images are saved to ./imgs2/ in crawler(), so create that directory here.
# (The original created ./imgs1/, which made every image save fail silently.)
if not os.path.exists('./imgs2/'):
    os.mkdir('./imgs2/')

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}

# Build the list of listing-page URLs: page 1 is the site root, later
# pages follow the /page/<n>/ pattern.
img_url = []
for p in range(1, 7):
    if p == 1:
        url = 'http://www.meizitu.net.cn/'
    else:
        url = f'http://www.meizitu.net.cn/page/{p}/'
    img_url.append(url)
    print(f"第{p}页")
print(img_url)

# Start the timer once, immediately before the crawl (the original
# assigned `start` twice; only this later assignment affects the
# elapsed-time report at the bottom of the script).
start = time.time()

# Shared queue of listing pages; crawler greenlets drain it concurrently.
work = Queue()
for url in img_url:
    work.put_nowait(url)
def crawler():
    """Worker greenlet: drain listing-page URLs from the shared ``work``
    queue, parse each page for ``div.thumb`` entries, and download every
    image into ./imgs2/.

    Reads module-level ``work`` (gevent Queue of page URLs) and
    ``HEADERS``. Side effects: writes .jpg files and prints progress.
    Returns None.
    """
    # Defensive: guarantee the output directory exists even if the setup
    # code created a different one (historically it made ./imgs1/).
    os.makedirs('./imgs2/', exist_ok=True)
    while not work.empty():  # stop once all pages have been claimed
        page_url = work.get_nowait()
        html = requests.get(page_url, headers=HEADERS, timeout=15).text
        soup = BeautifulSoup(html, 'lxml')
        for thumb in soup.find_all('div', class_='thumb'):
            src_url = thumb.find('img')['src']
            url_title = thumb.find('div').text.strip()
            try:
                # The image GET belongs inside the try: a failed download
                # should be logged and skipped, not kill the greenlet.
                ret = requests.get(src_url, headers=HEADERS, timeout=15)
                with open(f'./imgs2/{url_title}.jpg', 'wb') as f:
                    f.write(ret.content)
                print(f'{url_title}下载成功')
            except (requests.RequestException, OSError):
                # Narrowed from a bare ``except`` so real bugs surface;
                # network and file errors are reported per image.
                print(f"{url_title}爬取失败")
# Fan out 80 crawler greenlets over the shared queue, wait for all of
# them to complete, then report the elapsed wall-clock time.
task_list = [gevent.spawn(crawler) for _ in range(80)]
gevent.joinall(task_list)
end = time.time()
print(end - start)
# end of script