Scraping mzitu images with Python 3 multithreading
Python version: Python 3
Runs on: Windows 10 and Linux; other systems have not been tested.
1 Install dependencies
pip install requests
pip install lxml
pip install feedparser
2 Create a new folder
The script saves each gallery into a subfolder (named after the gallery title) of the directory it is run from, so put mzitu.py in an empty folder.
3 Run the script
python mzitu.py
Source code:
# -*- coding: UTF-8 -*-
import feedparser
import requests
from lxml import etree
import threading
import os


def get_url():
    # Collect the gallery links exposed by the site's RSS feed
    rss_url = 'https://www.mzitu.com/feed/'
    feeds = feedparser.parse(rss_url)
    page_url = []
    for i in range(20):
        page_url.append(feeds.entries[i]['link'])
    return page_url


def download(dirname, imgurl):
    # The image server checks the Referer header, so send one plus a browser UA
    headers = {
        'referer': 'https://www.mzitu.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    filename = imgurl.split('/')[-1]
    r = requests.get(imgurl, headers=headers, stream=True)
    # exist_ok avoids the race when several threads create the same folder
    os.makedirs(dirname, exist_ok=True)
    with open(dirname + '/' + filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=32):
            f.write(chunk)
    print('Downloading %s' % filename)


def get_img(url):
    r = requests.get(url)
    page = etree.HTML(r.text)
    # The last pagination link holds the total number of pages in the gallery
    span = page.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span')
    hs = page.xpath('//h2[@class="main-title"]')
    for h in hs:
        title = h.text
    for a in span:
        pages = a.text
    try:
        # Page 1 is the gallery URL itself; the remaining pages are url/2, url/3, ...
        for i in range(2, int(pages) + 1):
            imgpage = url + '/' + str(i)
            r1 = requests.get(imgpage)
            page1 = etree.HTML(r1.text)
            x_href = page1.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img')
            for href in x_href:
                imgurl = href.get('src')
                download(title, imgurl)
    except KeyboardInterrupt:
        pass
    except Exception:
        pass


def main():
    urls = get_url()
    threads = []
    # One thread per gallery
    for url in urls:
        t = threading.Thread(target=get_img, args=(url,))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
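For reference, the RSS feed is what limits this version to the most recent galleries: get_url() simply reads the first 20 entries that feedparser returns. A minimal sketch to inspect what the feed exposes (assuming the feed is still online; note that if it carries fewer than 20 entries, get_url() as written will raise an IndexError):

import feedparser

feeds = feedparser.parse('https://www.mzitu.com/feed/')
print('entries in feed:', len(feeds.entries))
for entry in feeds.entries[:5]:
    # each entry exposes the post link and title via dict-style access
    print(entry['link'], entry['title'])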
If you run into problems, download the source from Baidu Netdisk (百度网盘); extraction code: 7pv8
4 Upgraded version (downloads every gallery)
Instead of the RSS feed, which only lists the most recent posts, this version scrapes the full archive page at https://www.mzitu.com/all/ to collect the links of all galleries.
Source code:
# -*- coding: UTF-8 -*-
import requests
from lxml import etree
import threading
import os


def get_url2():
    # Scrape the archive page, which lists every gallery on the site
    rss_url = 'https://www.mzitu.com/all/'
    r = requests.get(rss_url)
    page = etree.HTML(r.text)
    result = page.xpath('/html/body/div[2]/div[1]/div[2]/ul/li/p[2]/a')
    print('%d galleries in total' % len(result))
    page_url = []
    for x in result:
        page_url.append(x.get('href'))
    return page_url


def download(dirname, imgurl):
    # The image server checks the Referer header, so send one plus a browser UA
    headers = {
        'referer': 'https://www.mzitu.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    }
    filename = imgurl.split('/')[-1]
    r = requests.get(imgurl, headers=headers, stream=True)
    # exist_ok avoids the race when several threads create the same folder
    os.makedirs(dirname, exist_ok=True)
    with open(dirname + '/' + filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=32):
            f.write(chunk)
    print('Downloading %s' % filename)


def get_img(url):
    r = requests.get(url)
    page = etree.HTML(r.text)
    # The last pagination link holds the total number of pages in the gallery
    span = page.xpath('/html/body/div[2]/div[1]/div[4]/a[5]/span')
    hs = page.xpath('//h2[@class="main-title"]')
    for h in hs:
        title = h.text
    for a in span:
        pages = a.text
    try:
        # Page 1 is the gallery URL itself; the remaining pages are url/2, url/3, ...
        for i in range(2, int(pages) + 1):
            imgpage = url + '/' + str(i)
            r1 = requests.get(imgpage)
            page1 = etree.HTML(r1.text)
            x_href = page1.xpath('/html/body/div[2]/div[1]/div[3]/p/a/img')
            for href in x_href:
                imgurl = href.get('src')
                download(title, imgurl)
    except KeyboardInterrupt:
        pass
    except Exception:
        pass


def main():
    urls = get_url2()
    threads = []
    # One thread per gallery: with thousands of galleries this means thousands of threads
    for url in urls:
        t = threading.Thread(target=get_img, args=(url,))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
If you run into problems, download the source from Baidu Netdisk (百度网盘); extraction code: nxoo
Note: in testing, the upgraded version from section 4 uses a lot of memory while running, since it starts one thread per gallery and the archive page yields far more links than the 20 handled by version 1; machines with little RAM will probably struggle.
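If memory is a concern, one option is to replace the one-thread-per-gallery loop in main() with a bounded worker pool from the standard library. A minimal sketch (not part of the original script; max_workers=8 is just an illustrative value):

from concurrent.futures import ThreadPoolExecutor

def main():
    urls = get_url2()  # or get_url() for version 1
    # at most 8 galleries are downloaded concurrently, no matter how many URLs there are
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(get_img, urls)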