
Image scraper (quick-and-dirty version)

import re
import requests
import random
import time
import os



class GetPhoto(object):
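    """Scrape wallpaper galleries from a listing page.

    Walks the pagination links of the listing page (url1), collects the
    gallery links found on each page, and saves each gallery's images
    under class_/<gallery title>/.
    """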
    def __init__(self, url1, class_):
        self.agent = "user_agents.txt"  # file with one User-Agent string per line
        self.head_url_list = {}         # category name -> URL (only used by the commented-out get_first_url)
        self.pic_item_url = []          # per listing page: list of (gallery URL, gallery title) tuples
        self.pic_url_list = []          # unused placeholder
        self.page_url_list = []         # pagination URLs of the listing page
        self.url = url1
        self.class_ = class_
        with open(self.agent, 'r', encoding='utf-8') as f:
            agents = f.readlines()
            self.agents = list(map(lambda x: {"user-agent": x.strip()}, agents))


    def chartset(self, rsp):
        """
        解决中文乱码问题
        :param rsp:
        :return:
        """
        _chart = requests.utils.get_encoding_from_headers(rsp.headers)
        if _chart == 'ISO-8859-1':
            rsp.encoding = requests.utils.get_encodings_from_content(rsp.text)[0]




    # def get_first_url(self):
    #     rsp = requests.get(self.url, headers=random.choice(self.agents))
    #     self.chartset(rsp)
    #     content = rsp.text
    #     pattern = r'<div class="nav both">.*?<!-- top end -->'
    #     com = re.compile(pattern, re.S)
    #     content_info = com.findall(content)
    #     pattern2 = 'href="(http://(?:[a-zA-Z0-9]+\.){1,2}[a-zA-Z]{2,6}.*?)".*?<span>(.+?)</span>'
    #     com2 = re.compile(pattern2, re.S)
    #     self.head_url_list.update(map(lambda x:x[::-1],com2.findall(content_info[0])))

    def get_page_urllist(self):
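        """Collect the pagination URLs shown at the bottom of the index page into self.page_url_list."""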
        print('Fetching the pagination links at the bottom of the index page!')
        rsp = requests.get(self.url, headers=random.choice(self.agents))
        self.chartset(rsp)
        content = rsp.text
        pattern = r'<div class="page both">.*?</div>'
        com = re.compile(pattern, re.S)
        content_info = com.findall(content)
        pattern2 = r"href='(.*?)'"
        com2 = re.compile(pattern2, re.S)
        self.page_url_list.append(self.url)
        self.page_url_list.extend(list(map(lambda x: self.url.rsplit("/", 1)[0] + "/" + x, com2.findall(content_info[0]))))
        print("Finished fetching the pagination links!")
        print(self.page_url_list)

    def get_sub_url(self):
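        """For each listing page, collect (gallery URL, gallery title) tuples into self.pic_item_url."""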
        self.get_page_urllist()
        print("开始获取整页图片项目链接:")
        for i in self.page_url_list[:10]:
            try:
                rsp = requests.get(i, headers=random.choice(self.agents))
            except Exception as e:
                with open('error.log', 'a', encoding='utf-8') as f:
                    new_time = time.strftime('%Y-%m-%d %H:%M:%S')
                    f.write('{} {} {}\n'.format(new_time, i, e.args[0]))
                print('Request failed, sleeping 10s!')
                time.sleep(10)
                continue        # skip this listing page instead of aborting the whole crawl
            self.chartset(rsp)
            content = rsp.text
            pattern = r'<div class="imgList2">.*?</div>'
            com = re.compile(pattern, re.S)
            content_info = com.findall(content)
            # print(content_info)
            pattern2 = r'href="(?P<url>http://(?:[\w]+\.){1,2}[a-zA-Z]{2,6}.*?)".*?target="_blank" title="(.*?)">'
            com2 = re.compile(pattern2, re.S)
            sub_url_list = com2.findall(content_info[0])
            self.pic_item_url.append(sub_url_list)
            print(sub_url_list)
            time.sleep(random.randint(1, 3))                 # pause 1-3 seconds between pages
        print("Finished collecting gallery links from all pages!")

    def get_pic_url(self, url2):
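        """Return the picture-page URLs of one gallery (url2), built from its pagination block."""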
        rsp = requests.get(url2, headers=random.choice(self.agents))
        self.chartset(rsp)
        content = rsp.text
        pattern = r'<div class="page".*?</div>'
        com = re.compile(pattern, re.S)
        content_info = com.findall(content)
        pattern2 = r"<a href='([^#]+?)'>"
        com2 = re.compile(pattern2, re.S)
        pic_url_list = list(map(lambda x: url2.rsplit("/",1)[0]+"/"+x, com2.findall(content_info[0])))
        return pic_url_list

    def get_pic(self, pic_page_url, dirname, count):
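        """Download the image on one picture page and save it as <dirname><count>.jpg under class_/<dirname>/."""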
        try:
            rsp = requests.get(pic_page_url, headers=random.choice(self.agents))
        except Exception as e:
            with open('error.log', 'a', encoding='utf-8') as f:
                new_time = time.strftime('%Y-%m-%d %H:%M:%S')
                f.write('{} {} {}\n'.format(new_time, pic_page_url, e.args[0]))
            print('Request failed, sleeping 30s!')
            time.sleep(30)
            print('Resuming download')
            return
        self.chartset(rsp)
        content = rsp.text
        # pattern = r"<p align=\"center\" id=\"contents\">.*?img\ssrc='(.*?)'"
        pattern = '<p align="center">.*?src="(.*?)"'
        # print(content)
        com = re.compile(pattern, re.S)
        pic_url = com.findall(content)
        image_name = dirname + str(count) + '.jpg'
        if pic_url:
            try:
                pic_rsp = requests.get(pic_url[0], headers=random.choice(self.agents))
            except Exception as e:
                with open('error.log', 'a', encoding='utf-8') as f:
                    new_time = time.strftime('%Y-%m-%d %H:%M:%S')
                    f.write('{} {} {}\n'.format(new_time, pic_url[0], e.args[0]))
                print('Request failed, sleeping 30s!')
                time.sleep(30)
                print('Resuming download')
                return
            img_dirpath = os.path.join(self.class_, dirname)
            if not os.path.exists(img_dirpath):
                os.makedirs(img_dirpath)
            img_path = os.path.join(img_dirpath, image_name)
            with open(img_path, 'wb') as f:
                f.write(pic_rsp.content)
            print('{} downloaded!'.format(image_name))
        else:
            print('Failed to download {}!'.format(image_name))
        time.sleep(random.randint(1, 3))

    def download(self):
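        """Entry point: collect the gallery links, then download the galleries from the first three listing pages."""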
        self.get_sub_url()
        for i in self.pic_item_url[:3]:
            for z in i:
                pic_item, dir_name = z
                lst = self.get_pic_url(pic_item)
                print("开始下载{}系列!".format(dir_name))
                for index, j in enumerate(lst, 1):
                    self.get_pic(j, dir_name, index)
                print("{}系列下载完成!".format(dir_name))



if __name__ == '__main__':
    url = 'http://www.5442.com/youxi/'
    pic = GetPhoto(url, '游戏壁纸')    # '游戏壁纸' ("game wallpapers") is used as the output directory name
    pic.download()
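
The constructor expects a file named user_agents.txt next to the script, with one User-Agent string per line; a random one is sent with every request. The file itself is not part of the post, so the two header values below are only an illustration of the expected format, not the original contents:

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15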

 

posted on 2018-08-23 15:01 by Louiszj