CSDN博客地址

Python--爬取彼岸网站的图片

爬取彼岸图网(pic.netbian.com)上的图片信息,并将图片下载后按类别分目录保存

思路: 先获取不同类别的链接信息,再获取不同类别图片分页的链接,进入图片详情页面获取图片下载地址,下载图片并分类保存

代码如下:

# encoding:utf-8
import os
import time
from urllib.parse import urljoin

import requests
from lxml import etree

# Request headers shared by every call to send_request; carries a desktop
# Chrome User-Agent string.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"}
# Root URL of the site being crawled; hrefs scraped from pages are resolved
# against it.
base_url = "http://pic.netbian.com/"

def send_request(url, timeout=10):
    """Fetch *url* with the shared request headers and return the raw body.

    Sleeps two seconds before each request so the crawl is throttled.

    Args:
        url: URL to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        The response body as ``bytes`` (HTML pages and image data alike).

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    time.sleep(2)
    response = requests.get(url=url, headers=header, timeout=timeout)
    return response.content

def get_catrgories_link():
    """Read the home page and crawl every category listed on it.

    Each ``<a>`` inside the ``classify clearfix`` block supplies a category
    title (used as the download folder name) and a link; both are passed
    to ``name_link``.
    """
    page = etree.HTML(send_request(base_url))
    for anchor in page.xpath("//div[@class='classify clearfix']/a"):
        hrefs = anchor.xpath("./@href")
        titles = anchor.xpath("./@title")
        if not hrefs or not titles:
            # An anchor without href/title cannot be crawled; skip it
            # instead of aborting the whole run with an IndexError.
            continue
        # urljoin copes with root-relative ("/x/"), relative ("x/") and
        # absolute hrefs; plain concatenation would produce "//" or a
        # doubled host for the first and last of those.
        name_link(titles[0], urljoin(base_url, hrefs[0]))

def name_link(dir_name, link, pages=4):
    """Walk the listing pages of one category and process every image entry.

    Args:
        dir_name: Category title; forwarded so images end up in a folder
            of that name.
        link: URL of the category's first listing page. Later pages are
            addressed as ``<link>index_<n>.html``.
        pages: Number of listing pages to crawl, starting from the first.
            Defaults to 4.
    """
    for page_no in range(1, pages + 1):
        # The first page lives at the category URL itself.
        url = link if page_no == 1 else "{}index_{}.html".format(link, page_no)
        html = etree.HTML(send_request(url))
        for item in html.xpath("//div[@class='slist']/ul/li"):
            names = item.xpath("./a/img/@alt")
            hrefs = item.xpath("./a/@href")
            if not names or not hrefs:
                # List items without a linked image carry nothing to
                # download; skipping them keeps one odd entry from
                # crashing the crawl.
                continue
            # Resolve the detail-page href against the site root.
            img_name_url(dir_name, names[0], urljoin(base_url, hrefs[0]))

def img_name_url(dir_name, img_name, img_link):
    """Open an image's detail page, find the image file URL and download it.

    Args:
        dir_name: Category title, forwarded to ``download``.
        img_name: Image title, forwarded to ``download``.
        img_link: URL of the image's detail page.
    """
    html = etree.HTML(send_request(img_link))
    sources = html.xpath("//*[@id='img']/img/@src")
    if not sources:
        # No image element on the page: report and move on rather than
        # raising IndexError and ending the crawl.
        print('未找到{}的图片地址, 已跳过'.format(img_name))
        return
    # Resolve the src against the site root; handles both root-relative
    # and absolute values.
    download(dir_name, img_name, urljoin(base_url, sources[0]))

# Running number of the image currently being downloaded (1-based).
count = 1


def _safe_name(name):
    """Return *name* with characters that are illegal in file names replaced by '_'."""
    return "".join("_" if ch in '\\/:*?"<>|' else ch for ch in name).strip()


def download(dir_name, img_name, image_url):
    """Download one image and store it as ``彼岸图库/<dir_name>/<img_name>.jpg``.

    Args:
        dir_name: Category title; used as the sub-folder name.
        img_name: Image title; used as the file name (``.jpg`` is appended).
        image_url: URL of the image file.

    Side effects:
        Creates the category folder on first use, writes the image file,
        prints progress messages and advances the module-level ``count``.
    """
    global count
    # Titles come straight from the web page; strip path separators and
    # other reserved characters so they cannot break or escape the path.
    folder = os.path.join("彼岸图库", _safe_name(dir_name))
    if not os.path.exists(folder):
        os.makedirs(folder)
        # Report the folder (category) name, not the image name.
        print('-------[{}]文件夹已经创建成功,开始下载图片-------'.format(dir_name))
    print('正在下载{}, 这是第{}张图片'.format(img_name, count))
    rep = send_request(image_url)
    with open(os.path.join(folder, '{}.jpg'.format(_safe_name(img_name))), 'wb') as f:
        f.write(rep)
    print('{}已经成功下载, 这是第{}张图片'.format(img_name, count))
    # Advance only after reporting, so both messages show the same number.
    count += 1

# Entry point: start the crawl from the home page as soon as the script runs.
get_catrgories_link()

 

posted @ 2020-05-01 21:29  Yi_warmth  阅读(659)  评论(0)  编辑  收藏  举报
CSDN博客地址