1-Python - 爬取必应壁纸
依赖
pip install requests
pip install BeautifulSoup4
pip install fake_useragent
代码示例
基础版
"""
pip install requests
pip install BeautifulSoup4
pip install -U fake-useragent
pip install fake_useragent
"""
import os
import requests
import time
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# Absolute path of the directory that contains this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# 'ig' sub-folder next to the script; used as the wallpaper download folder.
IG_FILE_PATH = os.path.join(BASE_DIR, 'ig')
def init():
    """Create the download folder ``IG_FILE_PATH`` if it does not exist yet.

    ``os.makedirs(..., exist_ok=True)`` replaces the separate
    "check, then create" steps, so there is no window in which another
    process can create the folder between the check and the ``mkdir``.

    Raises:
        FileExistsError: If ``IG_FILE_PATH`` exists but is not a directory.
    """
    os.makedirs(IG_FILE_PATH, exist_ok=True)
def spider(page=1, timeout=10):
    """Download every wallpaper listed on one page of bing.ioliu.cn.

    The listing page is parsed with BeautifulSoup. For each
    ``<div class="item">`` the first ``<a class="ctrl download">`` link is
    fetched and written into ``IG_FILE_PATH``.

    Args:
        page: Listing page number to crawl. Defaults to 1, the page this
            basic version was previously hard-wired to.
        timeout: Seconds to wait on each HTTP request before giving up, so
            a stalled connection cannot hang the script forever.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Build the User-Agent source once; ``.random`` is still read per request.
    user_agents = UserAgent()
    page_url = 'https://bing.ioliu.cn/?p=%s' % page
    ret = requests.get(
        page_url,
        headers={'User-Agent': user_agents.random},
        timeout=timeout,
    )
    if not ret.ok:
        print(page_url, 'request failed with status', ret.status_code)
        return

    bs = BeautifulSoup(ret.text, 'html.parser')
    for div in bs.find_all(name='div', attrs={"class": "item"}):
        # ``find`` returns None instead of raising when an item has no
        # download link.
        link = div.find(name='a', attrs={"class": "ctrl download"})
        img_url = link.get("href") if link is not None else None
        if not img_url:
            continue

        # ".../th?id=OHR.Name_1920x1080.jpg&qlt=100" -> "OHR.Name_1920x1080.jpg"
        file_name = img_url.partition('=')[2].split('&')[0]
        # The name comes from remote HTML: keep only the final path component
        # so it cannot point outside the download folder.
        file_name = os.path.basename(file_name)
        if not file_name:
            continue

        img_content = requests.get(
            url=img_url,
            headers={'User-Agent': user_agents.random},
            timeout=timeout,
        )
        if not img_content.ok:
            # Do not store an HTTP error page under an image file name.
            print(img_content.url, 'download failed with status', img_content.status_code)
            continue

        file_path = os.path.join(IG_FILE_PATH, file_name)
        with open(file_path, 'wb') as f:
            f.write(img_content.content)
        print(img_content.url, 'download done........')
if __name__ == "__main__":
    # Make sure the download folder exists before crawling.
    init()
    started_at = time.time()
    spider()  # basic version: crawls the first listing page
    elapsed = time.time() - started_at
    print('总耗时:', elapsed)
# The author reports this code ran without errors as of 2022-12-29.
"""
<div class="item">
<div class="card progressive">
<img class="progressive__img progressive--not-loaded" data-progressive="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg" src="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg"/>
<a class="mark" href="/photo/DudhsagarFallsGoa_ZH-CN0466471017?force=home_1"></a>
<div class="description"><h3>杜德萨加尔瀑布,印度果阿 (© Lucky-photographer/Getty Images)</h3>
<p class="calendar"><i class="icon icon-calendar"></i><em class="t">2022-12-16</em></p>
<p class="view"><i class="icon icon-eye">
</i><em class="t">6167</em>
</p>
</div>
<div class="options">
<a class="ctrl share" href="http://service.weibo.com/share/share.php?url=https://bing.ioliu.cn/photo/DudhsagarFallsGoa_ZH-CN0466471017&appkey=1833831541&pic=https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&ralateUid=5893653736&title=%23%E5%BF%85%E5%BA%94%E5%A3%81%E7%BA%B8%23%202022-12-16%20%2F%20%23%23%20%E6%9D%9C%E5%BE%B7%E8%90%A8%E5%8A%A0%E5%B0%94%E7%80%91%E5%B8%83%EF%BC%8C%E5%8D%B0%E5%BA%A6%E6%9E%9C%E9%98%BF%20(%C2%A9%20Lucky-photographer%2FGetty%20Images)..." rel="nofollow" target="_blank" title="分享到微博">
<i class="icon icon-share"></i>
<em class="t">分享</em>
</a>
<span class="ctrl heart" likes="29" photo="5200">
<i class="icon icon-heart"></i>
<em class="t">29</em>
</span>
<a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg&qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg">
<i class="icon icon-download"></i><em class="t">1920x1080</em></a>
<a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg">
<i class="icon icon-download"></i><em class="t">UHD</em></a></div></div></div>
"""
顺序爬取多页
import os
import time
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# Absolute path of the directory that contains this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# 'ig' sub-folder next to the script; used as the wallpaper download folder.
IG_FILE_PATH = os.path.join(BASE_DIR, 'ig')
def init():
    """Create the download folder ``IG_FILE_PATH`` if it does not exist yet.

    ``os.makedirs(..., exist_ok=True)`` replaces the separate
    "check, then create" steps, so there is no window in which another
    process can create the folder between the check and the ``mkdir``.

    Raises:
        FileExistsError: If ``IG_FILE_PATH`` exists but is not a directory.
    """
    os.makedirs(IG_FILE_PATH, exist_ok=True)
def spider(num, timeout=10):
    """Download the wallpapers from listing pages 1..num, one page at a time.

    Each listing page is parsed with BeautifulSoup. For each
    ``<div class="item">`` the first ``<a class="ctrl download">`` link is
    fetched and written into ``IG_FILE_PATH``.

    Args:
        num: Number of listing pages to crawl, starting at page 1.
        timeout: Seconds to wait on each HTTP request before giving up, so
            a stalled connection cannot hang the script forever.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Build the User-Agent source once; ``.random`` is still read per request.
    user_agents = UserAgent()
    for page in range(1, num + 1):
        page_url = 'https://bing.ioliu.cn/?p=%s' % page
        print(111, page, page_url)
        ret = requests.get(
            page_url,
            headers={'User-Agent': user_agents.random},
            timeout=timeout,
        )
        if not ret.ok:
            # Skip a broken page but keep crawling the remaining ones.
            print(page_url, 'request failed with status', ret.status_code)
            continue

        bs = BeautifulSoup(ret.text, 'html.parser')
        for div in bs.find_all(name='div', attrs={"class": "item"}):
            # ``find`` returns None instead of raising when an item has no
            # download link.
            link = div.find(name='a', attrs={"class": "ctrl download"})
            img_url = link.get("href") if link is not None else None
            if not img_url:
                continue

            # ".../th?id=OHR.Name_1920x1080.jpg&qlt=100" -> "OHR.Name_1920x1080.jpg"
            file_name = img_url.partition('=')[2].split('&')[0]
            # The name comes from remote HTML: keep only the final path
            # component so it cannot point outside the download folder.
            file_name = os.path.basename(file_name)
            if not file_name:
                continue

            img_content = requests.get(
                url=img_url,
                headers={'User-Agent': user_agents.random},
                timeout=timeout,
            )
            if not img_content.ok:
                # Do not store an HTTP error page under an image file name.
                print(img_content.url, 'download failed with status', img_content.status_code)
                continue

            file_path = os.path.join(IG_FILE_PATH, file_name)
            with open(file_path, 'wb') as f:
                f.write(img_content.content)
            print(img_content.url, 'download done........')
if __name__ == "__main__":
    # Make sure the download folder exists before crawling.
    init()
    started_at = time.time()
    spider(3)  # crawl listing pages 1 through 3
    elapsed = time.time() - started_at
    print('总耗时:', elapsed)  # the author's run reported about 37.39 seconds
# The author reports this code ran without errors as of 2022-12-29.
"""
<div class="item">
<div class="card progressive">
<img class="progressive__img progressive--not-loaded" data-progressive="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg" src="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg"/>
<a class="mark" href="/photo/DudhsagarFallsGoa_ZH-CN0466471017?force=home_1"></a>
<div class="description"><h3>杜德萨加尔瀑布,印度果阿 (© Lucky-photographer/Getty Images)</h3>
<p class="calendar"><i class="icon icon-calendar"></i><em class="t">2022-12-16</em></p>
<p class="view"><i class="icon icon-eye">
</i><em class="t">6167</em>
</p>
</div>
<div class="options">
<a class="ctrl share" href="http://service.weibo.com/share/share.php?url=https://bing.ioliu.cn/photo/DudhsagarFallsGoa_ZH-CN0466471017&appkey=1833831541&pic=https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&ralateUid=5893653736&title=%23%E5%BF%85%E5%BA%94%E5%A3%81%E7%BA%B8%23%202022-12-16%20%2F%20%23%23%20%E6%9D%9C%E5%BE%B7%E8%90%A8%E5%8A%A0%E5%B0%94%E7%80%91%E5%B8%83%EF%BC%8C%E5%8D%B0%E5%BA%A6%E6%9E%9C%E9%98%BF%20(%C2%A9%20Lucky-photographer%2FGetty%20Images)..." rel="nofollow" target="_blank" title="分享到微博">
<i class="icon icon-share"></i>
<em class="t">分享</em>
</a>
<span class="ctrl heart" likes="29" photo="5200">
<i class="icon icon-heart"></i>
<em class="t">29</em>
</span>
<a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg&qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg">
<i class="icon icon-download"></i><em class="t">1920x1080</em></a>
<a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg">
<i class="icon icon-download"></i><em class="t">UHD</em></a></div></div></div>
"""
并发爬取多页
import os
import time
import requests
from threading import Thread
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
# Absolute path of the directory that contains this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# 'ig' sub-folder next to the script; used as the wallpaper download folder.
IG_FILE_PATH = os.path.join(BASE_DIR, 'ig')
def init():
    """Create the download folder ``IG_FILE_PATH`` if it does not exist yet.

    ``os.makedirs(..., exist_ok=True)`` replaces the separate
    "check, then create" steps, so there is no window in which another
    process can create the folder between the check and the ``mkdir``.

    Raises:
        FileExistsError: If ``IG_FILE_PATH`` exists but is not a directory.
    """
    os.makedirs(IG_FILE_PATH, exist_ok=True)
def work(page, timeout=10):
    """Download every wallpaper listed on one page of bing.ioliu.cn.

    Intended as a thread target: it handles a single listing page. The page
    is parsed with BeautifulSoup. For each ``<div class="item">`` the first
    ``<a class="ctrl download">`` link is fetched and written into
    ``IG_FILE_PATH``.

    Args:
        page: Listing page number to crawl.
        timeout: Seconds to wait on each HTTP request before giving up, so
            a stalled connection cannot block the worker forever.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    page_url = 'https://bing.ioliu.cn/?p=%s' % page
    print(111, page, page_url)
    # Build the User-Agent source once; ``.random`` is still read per request.
    user_agents = UserAgent()
    ret = requests.get(
        page_url,
        headers={'User-Agent': user_agents.random},
        timeout=timeout,
    )
    if not ret.ok:
        print(page_url, 'request failed with status', ret.status_code)
        return

    bs = BeautifulSoup(ret.text, 'html.parser')
    for div in bs.find_all(name='div', attrs={"class": "item"}):
        # ``find`` returns None instead of raising when an item has no
        # download link.
        link = div.find(name='a', attrs={"class": "ctrl download"})
        img_url = link.get("href") if link is not None else None
        if not img_url:
            continue

        # ".../th?id=OHR.Name_1920x1080.jpg&qlt=100" -> "OHR.Name_1920x1080.jpg"
        file_name = img_url.partition('=')[2].split('&')[0]
        # The name comes from remote HTML: keep only the final path component
        # so it cannot point outside the download folder.
        file_name = os.path.basename(file_name)
        if not file_name:
            continue

        img_content = requests.get(
            url=img_url,
            headers={'User-Agent': user_agents.random},
            timeout=timeout,
        )
        if not img_content.ok:
            # Do not store an HTTP error page under an image file name.
            print(img_content.url, 'download failed with status', img_content.status_code)
            continue

        file_path = os.path.join(IG_FILE_PATH, file_name)
        with open(file_path, 'wb') as f:
            f.write(img_content.content)
        print(f'第{page}页的{file_name}爬取完毕.....')
def spider(num):
    """Crawl listing pages 1..num concurrently, one thread per page.

    Every thread runs ``work(page)``. All threads are started before any of
    them is joined, so the page downloads overlap; joining each thread
    right after starting it would run the pages one after another. The
    function returns only after every worker has finished, which keeps the
    caller's elapsed-time measurement meaningful.

    Args:
        num: Number of listing pages to crawl, starting at page 1.
    """
    threads = [Thread(target=work, args=(page,)) for page in range(1, num + 1)]
    for t in threads:
        t.start()
    # Wait for all workers only after every one of them is running.
    for t in threads:
        t.join()
if __name__ == "__main__":
    # Make sure the download folder exists before crawling.
    init()
    started_at = time.time()
    spider(3)  # crawl listing pages 1 through 3, one worker thread per page
    elapsed = time.time() - started_at
    print('总耗时:', elapsed)
# The author reports this code ran without errors as of 2022-12-29.
"""
<div class="item">
<div class="card progressive">
<img class="progressive__img progressive--not-loaded" data-progressive="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg" src="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg"/>
<a class="mark" href="/photo/DudhsagarFallsGoa_ZH-CN0466471017?force=home_1"></a>
<div class="description"><h3>杜德萨加尔瀑布,印度果阿 (© Lucky-photographer/Getty Images)</h3>
<p class="calendar"><i class="icon icon-calendar"></i><em class="t">2022-12-16</em></p>
<p class="view"><i class="icon icon-eye">
</i><em class="t">6167</em>
</p>
</div>
<div class="options">
<a class="ctrl share" href="http://service.weibo.com/share/share.php?url=https://bing.ioliu.cn/photo/DudhsagarFallsGoa_ZH-CN0466471017&appkey=1833831541&pic=https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&ralateUid=5893653736&title=%23%E5%BF%85%E5%BA%94%E5%A3%81%E7%BA%B8%23%202022-12-16%20%2F%20%23%23%20%E6%9D%9C%E5%BE%B7%E8%90%A8%E5%8A%A0%E5%B0%94%E7%80%91%E5%B8%83%EF%BC%8C%E5%8D%B0%E5%BA%A6%E6%9E%9C%E9%98%BF%20(%C2%A9%20Lucky-photographer%2FGetty%20Images)..." rel="nofollow" target="_blank" title="分享到微博">
<i class="icon icon-share"></i>
<em class="t">分享</em>
</a>
<span class="ctrl heart" likes="29" photo="5200">
<i class="icon icon-heart"></i>
<em class="t">29</em>
</span>
<a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg&qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg">
<i class="icon icon-download"></i><em class="t">1920x1080</em></a>
<a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg">
<i class="icon icon-download"></i><em class="t">UHD</em></a></div></div></div>
"""
效果展示
that's all