依赖

pip install requests
pip install BeautifulSoup4
pip install fake_useragent

代码示例

基础版

"""
pip install requests
pip install BeautifulSoup4
pip install -U fake-useragent
pip install fake_useragent

"""

import os
import requests
import time
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
IG_FILE_PATH = os.path.join(BASE_DIR, 'ig')


def init():
    """Create the image download directory if it does not already exist.

    ``exist_ok=True`` makes the call a no-op when the directory is already
    present, which avoids the check-then-create race of testing ``isdir``
    first and calling ``mkdir`` afterwards.
    """
    os.makedirs(IG_FILE_PATH, exist_ok=True)

def spider(timeout=10):
    """Download the wallpapers listed on page 1 of bing.ioliu.cn.

    Every ``div.item`` card on the listing page is scanned for its first
    ``a.ctrl.download`` link. The linked image is saved into
    ``IG_FILE_PATH`` under the value of the link's first query parameter
    (e.g. ``...th?id=OHR.Name_1920x1080.jpg&qlt=100`` is stored as
    ``OHR.Name_1920x1080.jpg``).

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
            A failing image download is reported and skipped instead.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import parse_qsl, urlparse

    # Build the User-Agent pool once instead of once per request.
    ua = UserAgent()

    ret = requests.get(
        'https://bing.ioliu.cn/?p=1',
        headers={'User-Agent': ua.random},
        timeout=timeout,
    )
    # Fail loudly on an HTTP error rather than parsing an error page.
    ret.raise_for_status()

    bs = BeautifulSoup(ret.text, 'html.parser')
    for div in bs.find_all(name='div', attrs={"class": "item"}):
        link = div.find(name='a', attrs={"class": "ctrl download"})
        img_url = link.get("href") if link is not None else None
        if not img_url:
            # Card without a usable download link: nothing to fetch.
            continue

        # Take the first query parameter's value as the file name and strip
        # any directory part so the file cannot land outside IG_FILE_PATH.
        params = parse_qsl(urlparse(img_url).query)
        file_name = os.path.basename(params[0][1]) if params else ''
        if not file_name:
            print(img_url, 'skipped: no file name in URL')
            continue

        try:
            img_content = requests.get(
                url=img_url,
                headers={'User-Agent': ua.random},
                timeout=timeout,
            )
            # Do not store an HTML error page under an image file name.
            img_content.raise_for_status()
        except requests.RequestException as exc:
            print(img_url, 'download failed:', exc)
            continue

        file_path = os.path.join(IG_FILE_PATH, file_name)
        with open(file_path, 'wb') as f:
            f.write(img_content.content)
        print(img_content.url, 'download done........')


if __name__ == "__main__":
    # Make sure the download directory exists before any file is written.
    init()
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    spider()  # fetches the first listing page only
    end_time = time.perf_counter()
    print('总耗时：', end_time - start_time)
    # Verified to run without errors as of 2022-12-29.

    """
    <div class="item">
        <div class="card progressive">
            <img class="progressive__img progressive--not-loaded" data-progressive="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg" src="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg"/>
                <a class="mark" href="/photo/DudhsagarFallsGoa_ZH-CN0466471017?force=home_1"></a>
                <div class="description"><h3>杜德萨加尔瀑布，印度果阿 (© Lucky-photographer/Getty Images)</h3>
                    <p class="calendar"><i class="icon icon-calendar"></i><em class="t">2022-12-16</em></p>
                    <p class="view"><i class="icon icon-eye">
                        </i><em class="t">6167</em>
                    </p>
                </div>
                <div class="options">
                    <a class="ctrl share" href="http://service.weibo.com/share/share.php?url=https://bing.ioliu.cn/photo/DudhsagarFallsGoa_ZH-CN0466471017&amp;appkey=1833831541&amp;pic=https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&amp;ralateUid=5893653736&amp;title=%23%E5%BF%85%E5%BA%94%E5%A3%81%E7%BA%B8%23%202022-12-16%20%2F%20%23%23%20%E6%9D%9C%E5%BE%B7%E8%90%A8%E5%8A%A0%E5%B0%94%E7%80%91%E5%B8%83%EF%BC%8C%E5%8D%B0%E5%BA%A6%E6%9E%9C%E9%98%BF%20(%C2%A9%20Lucky-photographer%2FGetty%20Images)..." rel="nofollow" target="_blank" title="分享到微博">
                    <i class="icon icon-share"></i>
                        <em class="t">分享</em>
                            </a>
                                <span class="ctrl heart" likes="29" photo="5200">
                                <i class="icon icon-heart"></i>
                                <em class="t">29</em>
                                </span>
                                <a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg&amp;qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg">
                                <i class="icon icon-download"></i><em class="t">1920x1080</em></a>
                                <a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&amp;qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg">
                                <i class="icon icon-download"></i><em class="t">UHD</em></a></div></div></div>

    
    """

顺序爬取多页

import os
import time
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
IG_FILE_PATH = os.path.join(BASE_DIR, 'ig')


def init():
    """Create the image download directory if it does not already exist.

    ``exist_ok=True`` makes the call a no-op when the directory is already
    present, which avoids the check-then-create race of testing ``isdir``
    first and calling ``mkdir`` afterwards.
    """
    os.makedirs(IG_FILE_PATH, exist_ok=True)


def spider(num, timeout=10):
    """Download the wallpapers from listing pages 1..num, one page at a time.

    For each page, every ``div.item`` card is scanned for its first
    ``a.ctrl.download`` link. The linked image is saved into
    ``IG_FILE_PATH`` under the value of the link's first query parameter
    (e.g. ``...th?id=OHR.Name_1920x1080.jpg&qlt=100`` is stored as
    ``OHR.Name_1920x1080.jpg``).

    Args:
        num: Number of listing pages to crawl, starting from page 1.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If a listing page cannot be fetched.
            A failing image download is reported and skipped instead.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import parse_qsl, urlparse

    # Build the User-Agent pool once instead of once per request.
    ua = UserAgent()

    for page in range(1, num + 1):
        page_url = 'https://bing.ioliu.cn/?p=%s' % page
        print(111, page, page_url)
        ret = requests.get(
            page_url,
            headers={'User-Agent': ua.random},
            timeout=timeout,
        )
        # Fail loudly on an HTTP error rather than parsing an error page.
        ret.raise_for_status()

        bs = BeautifulSoup(ret.text, 'html.parser')
        for div in bs.find_all(name='div', attrs={"class": "item"}):
            link = div.find(name='a', attrs={"class": "ctrl download"})
            img_url = link.get("href") if link is not None else None
            if not img_url:
                # Card without a usable download link: nothing to fetch.
                continue

            # Take the first query parameter's value as the file name and
            # strip any directory part so the file stays in IG_FILE_PATH.
            params = parse_qsl(urlparse(img_url).query)
            file_name = os.path.basename(params[0][1]) if params else ''
            if not file_name:
                print(img_url, 'skipped: no file name in URL')
                continue

            try:
                img_content = requests.get(
                    url=img_url,
                    headers={'User-Agent': ua.random},
                    timeout=timeout,
                )
                # Do not store an HTML error page under an image file name.
                img_content.raise_for_status()
            except requests.RequestException as exc:
                print(img_url, 'download failed:', exc)
                continue

            file_path = os.path.join(IG_FILE_PATH, file_name)
            with open(file_path, 'wb') as f:
                f.write(img_content.content)
            print(img_content.url, 'download done........')


if __name__ == "__main__":
    # Make sure the download directory exists before any file is written.
    init()
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    spider(3)  # number of listing pages to crawl
    end_time = time.perf_counter()
    print('总耗时：', end_time - start_time)  # sample output: 总耗时： 37.38869881629944
    # Verified to run without errors as of 2022-12-29.

    """
    <div class="item">
        <div class="card progressive">
            <img class="progressive__img progressive--not-loaded" data-progressive="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg" src="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg"/>
                <a class="mark" href="/photo/DudhsagarFallsGoa_ZH-CN0466471017?force=home_1"></a>
                <div class="description"><h3>杜德萨加尔瀑布，印度果阿 (© Lucky-photographer/Getty Images)</h3>
                    <p class="calendar"><i class="icon icon-calendar"></i><em class="t">2022-12-16</em></p>
                    <p class="view"><i class="icon icon-eye">
                        </i><em class="t">6167</em>
                    </p>
                </div>
                <div class="options">
                    <a class="ctrl share" href="http://service.weibo.com/share/share.php?url=https://bing.ioliu.cn/photo/DudhsagarFallsGoa_ZH-CN0466471017&amp;appkey=1833831541&amp;pic=https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&amp;ralateUid=5893653736&amp;title=%23%E5%BF%85%E5%BA%94%E5%A3%81%E7%BA%B8%23%202022-12-16%20%2F%20%23%23%20%E6%9D%9C%E5%BE%B7%E8%90%A8%E5%8A%A0%E5%B0%94%E7%80%91%E5%B8%83%EF%BC%8C%E5%8D%B0%E5%BA%A6%E6%9E%9C%E9%98%BF%20(%C2%A9%20Lucky-photographer%2FGetty%20Images)..." rel="nofollow" target="_blank" title="分享到微博">
                    <i class="icon icon-share"></i>
                        <em class="t">分享</em>
                            </a>
                                <span class="ctrl heart" likes="29" photo="5200">
                                <i class="icon icon-heart"></i>
                                <em class="t">29</em>
                                </span>
                                <a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg&amp;qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg">
                                <i class="icon icon-download"></i><em class="t">1920x1080</em></a>
                                <a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&amp;qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg">
                                <i class="icon icon-download"></i><em class="t">UHD</em></a></div></div></div>

    
    """

并发爬取多页

import os
import time
import requests
from threading import Thread
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
IG_FILE_PATH = os.path.join(BASE_DIR, 'ig')


def init():
    """Create the image download directory if it does not already exist.

    ``exist_ok=True`` makes the call a no-op when the directory is already
    present, which avoids the check-then-create race of testing ``isdir``
    first and calling ``mkdir`` afterwards.
    """
    os.makedirs(IG_FILE_PATH, exist_ok=True)

def work(page, timeout=10):
    """Download every wallpaper listed on one page of bing.ioliu.cn.

    Every ``div.item`` card on the page is scanned for its first
    ``a.ctrl.download`` link. The linked image is saved into
    ``IG_FILE_PATH`` under the value of the link's first query parameter
    (e.g. ``...th?id=OHR.Name_1920x1080.jpg&qlt=100`` is stored as
    ``OHR.Name_1920x1080.jpg``).

    Args:
        page: 1-based number of the listing page to crawl.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
            A failing image download is reported and skipped instead.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from urllib.parse import parse_qsl, urlparse

    # Build the User-Agent pool once per worker instead of once per request.
    ua = UserAgent()

    page_url = 'https://bing.ioliu.cn/?p=%s' % page
    print(111, page, page_url)
    ret = requests.get(
        page_url,
        headers={'User-Agent': ua.random},
        timeout=timeout,
    )
    # Fail loudly on an HTTP error rather than parsing an error page.
    ret.raise_for_status()

    bs = BeautifulSoup(ret.text, 'html.parser')
    for div in bs.find_all(name='div', attrs={"class": "item"}):
        link = div.find(name='a', attrs={"class": "ctrl download"})
        img_url = link.get("href") if link is not None else None
        if not img_url:
            # Card without a usable download link: nothing to fetch.
            continue

        # Take the first query parameter's value as the file name and strip
        # any directory part so the file cannot land outside IG_FILE_PATH.
        params = parse_qsl(urlparse(img_url).query)
        file_name = os.path.basename(params[0][1]) if params else ''
        if not file_name:
            print(img_url, 'skipped: no file name in URL')
            continue

        try:
            img_content = requests.get(
                url=img_url,
                headers={'User-Agent': ua.random},
                timeout=timeout,
            )
            # Do not store an HTML error page under an image file name.
            img_content.raise_for_status()
        except requests.RequestException as exc:
            print(img_url, 'download failed:', exc)
            continue

        file_path = os.path.join(IG_FILE_PATH, file_name)
        with open(file_path, 'wb') as f:
            f.write(img_content.content)
        print(f'第{page}页的{file_name}爬取完毕.....')


def spider(num):
    """Crawl listing pages 1..num concurrently, one thread per page.

    Blocks until every page's worker thread has finished, so a caller that
    times this call measures the whole crawl. Does nothing when ``num`` is
    less than 1.

    Args:
        num: Number of listing pages to crawl, starting from page 1.
    """
    threads = [Thread(target=work, args=(page, )) for page in range(1, num + 1)]
    for t in threads:
        t.start()
    # Join every worker, not only the last one started; otherwise this
    # function could return while earlier pages are still downloading.
    for t in threads:
        t.join()

if __name__ == "__main__":
    # Make sure the download directory exists before any file is written.
    init()
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    spider(3)  # number of listing pages to crawl
    end_time = time.perf_counter()
    print('总耗时：', end_time - start_time)
    # Verified to run without errors as of 2022-12-29.

    """
    <div class="item">
        <div class="card progressive">
            <img class="progressive__img progressive--not-loaded" data-progressive="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg" src="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_800x480.jpg"/>
                <a class="mark" href="/photo/DudhsagarFallsGoa_ZH-CN0466471017?force=home_1"></a>
                <div class="description"><h3>杜德萨加尔瀑布，印度果阿 (© Lucky-photographer/Getty Images)</h3>
                    <p class="calendar"><i class="icon icon-calendar"></i><em class="t">2022-12-16</em></p>
                    <p class="view"><i class="icon icon-eye">
                        </i><em class="t">6167</em>
                    </p>
                </div>
                <div class="options">
                    <a class="ctrl share" href="http://service.weibo.com/share/share.php?url=https://bing.ioliu.cn/photo/DudhsagarFallsGoa_ZH-CN0466471017&amp;appkey=1833831541&amp;pic=https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&amp;ralateUid=5893653736&amp;title=%23%E5%BF%85%E5%BA%94%E5%A3%81%E7%BA%B8%23%202022-12-16%20%2F%20%23%23%20%E6%9D%9C%E5%BE%B7%E8%90%A8%E5%8A%A0%E5%B0%94%E7%80%91%E5%B8%83%EF%BC%8C%E5%8D%B0%E5%BA%A6%E6%9E%9C%E9%98%BF%20(%C2%A9%20Lucky-photographer%2FGetty%20Images)..." rel="nofollow" target="_blank" title="分享到微博">
                    <i class="icon icon-share"></i>
                        <em class="t">分享</em>
                            </a>
                                <span class="ctrl heart" likes="29" photo="5200">
                                <i class="icon icon-heart"></i>
                                <em class="t">29</em>
                                </span>
                                <a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg&amp;qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_1920x1080.jpg">
                                <i class="icon icon-download"></i><em class="t">1920x1080</em></a>
                                <a class="ctrl download" href="https://bing.com/th?id=OHR.DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg&amp;qlt=100" photo="5200" rel="nofollow" target="_blank" title="DudhsagarFallsGoa_ZH-CN0466471017_UHD.jpg">
                                <i class="icon icon-download"></i><em class="t">UHD</em></a></div></div></div>

    
    """

效果展示

that's all

posted @ 2019-03-04 21:33 听雨危楼 阅读(300) 评论(0) 收藏 举报

刷新页面 返回顶部

王战山的学习笔记

非淡泊无以明志，非宁静无以致远

1-Python - 爬取必应壁纸

依赖

代码示例

效果展示

公告