爬取伯乐的全部图片 -《狗嗨默示录》-

#!/usr/bin/env python
# -*- coding:UTF-8 -*-

import urllib.request
import urllib.error
import urllib.parse
import time
import socket
import re
from bs4 import BeautifulSoup

# Fetch the homepage source.
# Entry point for the crawl: the page whose links and images are scraped.
index_url = "http://www.jobbole.com" # homepage

class JobBole():
    """Download the raw HTML of the jobbole.com homepage."""

    def __init__(self):
        # Desktop Chrome UA string so the server does not reject the request.
        self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        self.html_data = []  # accumulates downloaded page source strings

    def get_html(self):
        """Fetch the homepage and return its HTML as a string.

        Returns an empty string when the request fails (the failure is
        logged, matching the original best-effort behavior).
        """
        req = urllib.request.Request(index_url)
        # BUG FIX: the HTTP header name is 'User-Agent' (hyphen); the
        # original 'User_Agent' header was silently ignored by servers.
        req.add_header('User-Agent', self.user_agent)
        try:
            my_data = urllib.request.urlopen(req).read().decode('utf-8')
            self.html_data.append(my_data)
        except urllib.error.URLError as err:
            if hasattr(err, 'reason'):  # report only errors carrying a reason
                print(u"链接失败", err.reason)
        # BUG FIX: the original returned str(self.html_data), which wraps
        # the HTML in "['...']" (a stringified list) and corrupts it for
        # BeautifulSoup; join the accumulated pages instead.
        return ''.join(self.html_data)
#html = JobBole().get_html()

# Collect hyperlinks from the homepage and download the images on each linked page
class GetHref():
    """Walk every http link on the homepage and save all images it finds."""

    def __init__(self):
        # Homepage source fetched by JobBole (network I/O happens here).
        self.html = JobBole().get_html()

    def get_hrefurl(self):
        """Visit each absolute http link on the homepage, scrape its <img>
        tags and download every image to disk.

        Returns the list of image URLs found (the original returned None;
        returning the list is backward compatible for callers that ignore
        the result). Failures are logged and skipped, never raised.
        """
        hrefs = []       # absolute links discovered on the homepage
        imgs = []        # image URLs discovered on the linked pages
        img_label = 0    # running index used in the saved file name
        img_err = 0      # count of failed downloads, for the log message
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
        # BUG FIX: set the socket timeout BEFORE any network call; the
        # original set it after the first urlopen(), leaving that request
        # without a time limit.
        socket.setdefaulttimeout(60)
        soup = BeautifulSoup(self.html, 'html.parser')
        for item in soup.find_all('a'):
            href = item.get('href')
            # Skip anchors without an absolute http(s) URL.
            if href is None or 'http' not in href:
                continue
            hrefs.append(href)
            url_req = urllib.request.Request(href)
            # BUG FIX: header name is 'User-Agent', not 'User_Agent'.
            url_req.add_header('User-Agent', user_agent)
            try:
                href_data = urllib.request.urlopen(url_req).read().decode('utf-8')
                time.sleep(1)  # be polite: throttle to one page per second
                img_soup = BeautifulSoup(href_data, 'html.parser')
                for item_img in img_soup.find_all('img'):
                    img_url = item_img.get('src')
                    if img_url is None or 'http' not in img_url:
                        continue
                    imgs.append(img_url)
                    print(img_url)
                    # Raw string: the original non-raw path only worked
                    # because '\学', '\W', '\M', '\p' happen not to be
                    # escape sequences (a DeprecationWarning in Python 3).
                    target = r'D:\学习资料\Workspace\Mywork\picture\%s.jpg' % img_label
                    try:
                        try:
                            urllib.request.urlretrieve(img_url, target)
                        except UnicodeEncodeError:
                            # Percent-encode non-ASCII characters in the URL.
                            urllib.request.urlretrieve(
                                urllib.parse.quote(img_url, safe=':/?='), target)
                        # BUG FIX: advance the index after EITHER retrieve;
                        # the original skipped it in the quoted-retry branch,
                        # so the next image silently overwrote the file.
                        img_label += 1
                    except urllib.error.URLError as err_img:
                        if hasattr(err_img, 'reason'):
                            img_err += 1
                            print(u"下载错误,失败%s次" % img_err, err_img.reason)
            except urllib.error.URLError as err:
                if hasattr(err, 'reason'):
                    print(u"链接失败", err.reason)
        return imgs

data = GetHref().get_hrefurl()
posted @ 2017-08-04 14:37  李·狗嗨  阅读(164)  评论(0编辑  收藏  举报