爬取伯乐的全部图片 -《狗嗨默示录》-

#!/usr/bin/env python
# -*- coding:UTF-8 -*-

import urllib.request
import urllib.error
import urllib.parse
import time
import socket
import re
from bs4 import BeautifulSoup

# Fetch the homepage source.
# Entry point for the crawl: the page whose links and images are scraped.
index_url = "http://www.jobbole.com" # homepage

class JobBole():
    """Download the raw HTML of the jobbole.com homepage."""

    def __init__(self):
        # Desktop Chrome UA string so the server does not reject the request.
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        self.html_data = []  # accumulates downloaded page source strings

    def get_html(self):
        """Fetch the homepage and return its HTML as a string.

        Returns an empty string when the request fails (the failure is
        logged, matching the original best-effort behavior).
        """
        req = urllib.request.Request(index_url)
        # BUG FIX: the HTTP header name is 'User-Agent' (hyphen); the
        # original 'User_Agent' header was silently ignored by servers.
        req.add_header('User-Agent', self.user_agent)
        try:
            my_data = urllib.request.urlopen(req).read().decode('utf-8')
            self.html_data.append(my_data)
        except urllib.error.URLError as err:
            if hasattr(err, 'reason'):  # report only errors carrying a reason
                print(u"链接失败", err.reason)
        # BUG FIX: the original returned str(self.html_data), which wraps
        # the HTML in "['...']" (a stringified list) and corrupts it for
        # BeautifulSoup; join the accumulated pages instead.
        return ''.join(self.html_data)
#html = JobBole().get_html()

# Collect hyperlinks from the homepage and download the images on each linked page
class GetHref():
    """Walk every http link on the homepage and save all images it finds."""

    def __init__(self):
        # Homepage source fetched by JobBole (network I/O happens here).
        self.html = JobBole().get_html()

    def get_hrefurl(self):
        """Visit each absolute http link on the homepage, scrape its <img>
        tags and download every image to disk.

        Returns the list of image URLs found (the original returned None;
        returning the list is backward compatible for callers that ignore
        the result). Failures are logged and skipped, never raised.
        """
        hrefs = []       # absolute links discovered on the homepage
        imgs = []        # image URLs discovered on the linked pages
        img_label = 0    # running index used in the saved file name
        img_err = 0      # count of failed downloads, for the log message
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        # BUG FIX: set the socket timeout BEFORE any network call; the
        # original set it after the first urlopen(), leaving that request
        # without a time limit.
        socket.setdefaulttimeout(60)
        soup = BeautifulSoup(self.html, 'html.parser')
        for item in soup.find_all('a'):
            href = item.get('href')
            # Skip anchors without an absolute http(s) URL.
            if href is None or 'http' not in href:
                continue
            hrefs.append(href)
            url_req = urllib.request.Request(href)
            # BUG FIX: header name is 'User-Agent', not 'User_Agent'.
            url_req.add_header('User-Agent', user_agent)
            try:
                href_data = urllib.request.urlopen(url_req).read().decode('utf-8')
                time.sleep(1)  # be polite: throttle to one page per second
                img_soup = BeautifulSoup(href_data, 'html.parser')
                for item_img in img_soup.find_all('img'):
                    img_url = item_img.get('src')
                    if img_url is None or 'http' not in img_url:
                        continue
                    imgs.append(img_url)
                    print(img_url)
                    # Raw string: the original non-raw path only worked
                    # because '\学', '\W', '\M', '\p' happen not to be
                    # escape sequences (a DeprecationWarning in Python 3).
                    target = r'D:\学习资料\Workspace\Mywork\picture\%s.jpg' % img_label
                    try:
                        try:
                            urllib.request.urlretrieve(img_url, target)
                        except UnicodeEncodeError:
                            # Percent-encode non-ASCII characters in the URL.
                            urllib.request.urlretrieve(
                                urllib.parse.quote(img_url, safe=':/?='), target)
                        # BUG FIX: advance the index after EITHER retrieve;
                        # the original skipped it in the quoted-retry branch,
                        # so the next image silently overwrote the file.
                        img_label += 1
                    except urllib.error.URLError as err_img:
                        if hasattr(err_img, 'reason'):
                            img_err += 1
                            print(u"下载错误,失败%s次" % img_err, err_img.reason)
            except urllib.error.URLError as err:
                if hasattr(err, 'reason'):
                    print(u"链接失败", err.reason)
        return imgs

data = GetHref().get_hrefurl()
posted @ 2017-08-04 14:37  李·狗嗨  阅读(164)  评论(0编辑  收藏  举报