用python爬取全网妹子图片【附源码笔记】

   这是晚上没事无聊写的python爬虫小程序,专门爬取妹子图的,养眼用的,嘻嘻!身为程序狗只会这个了!

  废话不多说,代码附上,仅供参考学习!

    

(以下为完整源码)
"""
功能:爬取妹子图全网妹子图片,可以选择爬取年份,自动分类保存
作者:68喜科技
"""
import requests
from lxml import etree
# import re
import os
# from time import sleep
 
class Meizitu(object):
    """Crawl the galleries listed on mzitu.com's archive page for one year.

    The archive index groups gallery links by year; every gallery's images
    are downloaded and saved under ``./妹子图/<category>/<date>/<title>/``.
    """

    def __init__(self, year, newest_year=2019):
        # Archive index that lists every gallery grouped by year.
        self.url = "http://www.mzitu.com/all/"
        self.headers = {"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
        self.year = year
        # Most recent year shown on the archive page.  The page lists years
        # newest-first, so a year's 1-based section position is
        # (newest_year - year).  Parameterized (default keeps the old
        # behavior) because the original hard-coded 2019 silently breaks
        # as soon as the site adds a new year section.
        self.newest_year = newest_year

    def get_page(self, url, headers):
        """Fetch *url* and return the response body decoded as text.

        Assumes the page is UTF-8 (``bytes.decode()`` default) —
        TODO confirm against the site's actual charset.
        """
        # timeout keeps one stalled connection from hanging the whole crawl
        response = requests.get(url, headers=headers, timeout=10)
        return response.content.decode()

    def get_detail_urls_list(self, page_content, year):
        """Return the detail-page URLs for *year*'s section of the archive.

        Returns an empty list when the requested year has no section, so
        callers can iterate the result unconditionally (the original
        returned None here, which crashed ``get_image``).
        """
        html_content = etree.HTML(page_content)
        year_list = html_content.xpath("//div[@class='year']/text()")
        # Year sections are ordered newest-first; XPath positions are 1-based.
        index = self.newest_year - year
        xpath_var = "//div[@class='year'][{}]/following-sibling::*[1]//p[@class='url']/a/@href".format(index)
        # Guard both ends: index < 1 (future year) would be an invalid
        # XPath position, index > len(year_list) a missing section.
        if 1 <= index <= len(year_list):
            return html_content.xpath(xpath_var)
        return []

    def save_path(self, detail_html_content, first_img_url, img_name):
        """Build (and create if needed) the directory one gallery saves to."""
        # Category name taken from the page's breadcrumb trail.
        path_prefix1 = detail_html_content.xpath("//div[@class='currentpath']/a/text()")[1]
        # "/YYYY/MM/" date fragment sliced out of the image URL.
        # NOTE(review): relies on the fixed-length host prefix
        # "http://i.meizitu.net/" — breaks if the image host changes.
        path_prefix2 = first_img_url[20:29]
        save_path = "./妹子图/" + path_prefix1 + path_prefix2 + img_name + "/"
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() + makedirs() pair.
        os.makedirs(save_path, exist_ok=True)
        return save_path

    def save_img(self, img_url, img_headers, img_save_path):
        """Download a single image and write it to *img_save_path*."""
        img_content = requests.get(img_url, headers=img_headers, timeout=10).content
        with open(img_save_path, "wb") as f:
            f.write(img_content)

    def img_url(self, first_img_url, img_index):
        """Return the URL of image number *img_index* within a gallery.

        Image files are named like ``.../2018/02/18c01.jpg``: a fixed
        32-character prefix followed by a zero-padded index.
        """
        # {:02d} reproduces the original manual '0' + str(i) padding,
        # including 3-digit indices for galleries with >= 100 pages.
        return "{}{:02d}.jpg".format(first_img_url[:32], img_index)

    def img_headers(self, url, img_index):
        """Build the request headers for fetching one image.

        The image host checks the Referer: page 1 of a gallery uses the
        gallery URL itself, later pages append "/<index>".
        """
        if img_index == 1:
            refer_url = url
        else:
            refer_url = url + "/" + str(img_index)
        return {
            "Host":"i.meizitu.net",
            "Referer":refer_url,
            "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }

    def get_img_urls(self, url, detail_html_content, first_img_url, img_name, save_path):
        """Download every image of one gallery into *save_path*."""
        # The 5th entry of the pagination bar holds the total image count.
        img_total_num = int(detail_html_content.xpath("//div[@class='pagenavi']/a/span/text()")[4])
        for img_index in range(1, img_total_num + 1):
            img_url = self.img_url(first_img_url, img_index)
            img_headers = self.img_headers(url, img_index)
            # Per-image file path inside the gallery directory.
            img_save_path = save_path + img_name + str(img_index) + ".jpg"
            self.save_img(img_url, img_headers, img_save_path)

    def get_image(self, detail_urls_list):
        """Fetch each gallery page and download all of its images."""
        for url in detail_urls_list:
            detail_page = self.get_page(url, headers=self.headers)
            detail_html_content = etree.HTML(detail_page)
            first_imgs = detail_html_content.xpath("//div[@class='main-image']/p/a/img/@src")
            titles = detail_html_content.xpath("//h2[@class='main-title']/text()")
            # Skip pages that do not match the expected layout instead of
            # killing the whole crawl with an IndexError.
            if not first_imgs or not titles:
                continue
            # URL of the gallery's first image; its prefix drives naming.
            first_img_url = first_imgs[0]
            # Gallery title, used as directory and file name stem.
            img_name = titles[0]
            save_path = self.save_path(detail_html_content, first_img_url, img_name)
            self.get_img_urls(url, detail_html_content, first_img_url, img_name, save_path)

    def run_spider(self):
        """Entry point: fetch the archive index and crawl the chosen year."""
        page_content = self.get_page(self.url, self.headers)
        detail_urls_list = self.get_detail_urls_list(page_content, self.year)
        self.get_image(detail_urls_list)
 
if __name__ == "__main__":
    # Ask which year's archive to crawl, then launch the spider.
    target_year = int(input("请输入您要爬取的年份:"))
    Meizitu(target_year).run_spider()

  

    

posted @   圆柱模板  阅读(1743)  评论(0编辑  收藏  举报
编辑推荐:
· .NET 9 new features-C#13新的锁类型和语义
· Linux系统下SQL Server数据库镜像配置全流程详解
· 现代计算机视觉入门之:什么是视频
· 你所不知道的 C/C++ 宏知识
· 聊一聊 操作系统蓝屏 c0000102 的故障分析
阅读排行:
· DeepSeek V3 两周使用总结
· 回顾我的软件开发经历(1)
· C#使用yield关键字提升迭代性能与效率
· 低成本高可用方案!Linux系统下SQL Server数据库镜像配置全流程详解
· 4. 使用sql查询excel内容
点击右上角即可分享
微信分享提示