Python 抓取不得姐动图(报错 urllib.error.HTTPError: HTTP Error 403: Forbidden)
抓取不得姐动图(报错)
# -*- coding:utf-8 -*-
# __author__ : kusy
# __content__: file description
# __date__   : 2018/7/23 17:01
"""Scrape GIF images from www.budejie.com (first, failing version).

This listing sends its requests with urllib's default headers.  The
traceback recorded for it shows the page request for
http://www.budejie.com/2 being rejected with
``urllib.error.HTTPError: HTTP Error 403: Forbidden``.
"""
import urllib.request
import re


def getHtml(url):
    """Fetch *url* and return the raw response body as bytes."""
    page = urllib.request.urlopen(url)
    return page.read()


def getImg(reg, savePath):
    """Create a downloader for every image URL matched by *reg*.

    The returned function takes a page body (bytes), decodes it as UTF-8,
    and saves each captured URL to ``savePath + '<n>.gif'``.  The number
    ``n`` starts at 0 and keeps counting across calls, so files written
    for later pages do not overwrite earlier ones.
    """
    saved = 0

    def giveImg(html):
        nonlocal saved
        pattern = re.compile(reg)
        for link in pattern.findall(html.decode('utf-8')):
            urllib.request.urlretrieve(link, savePath + '%s.gif' % saved)
            saved += 1

    return giveImg


# Pattern for the image URLs embedded in the budejie.com pages.
reg = r'data-original="(.+?\.gif)"'
savePath = 'image/gif/'
g = getImg(reg, savePath)

for i in range(10):
    if i <= 1:
        # Indexes 0 and 1 both request the site root.
        html = getHtml("http://www.budejie.com/")
    else:
        print("http://www.budejie.com/" + str(i))
        html = getHtml("http://www.budejie.com/" + str(i))
    g(html)
报错如下:
E:\kusy\python\venv\Scripts\python.exe E:/kusy/python/getJpg.py
http://www.budejie.com/2
Traceback (most recent call last):
  File "E:/kusy/python/getJpg.py", line 35, in <module>
    html = getHtml("http://www.budejie.com/" + str(i))
  File "E:/kusy/python/getJpg.py", line 9, in getHtml
    page = urllib.request.urlopen(url)
  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
    response = meth(req, response)
  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 570, in error
    return self._call_chain(*args)
  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
    result = func(*args)
  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

Process finished with exit code 1
百度了一下,已解决:在请求中加上 User-Agent 头信息,伪装成浏览器访问即可:
# -*- coding:utf-8 -*-
# __author__ : kusy
# __date__   : 2018/7/23 17:01
"""Download the GIF images listed on the first pages of www.budejie.com.

Run as a script to fetch pages 1-9 and store every matched image under
``savePath``.  Importing the module only defines the helpers; nothing is
downloaded until ``main()`` is called.
"""
import os
import re
import shutil
import urllib.request

# Without a browser-like User-Agent the site answers with
# "urllib.error.HTTPError: HTTP Error 403: Forbidden" because it rejects
# crawlers.  The exact value can be copied from a real browser session
# (for example with Firefox's FireBug plugin).
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

# Seconds to wait on a connection before giving up instead of hanging.
TIMEOUT = 30

BASE_URL = "http://www.budejie.com/"

# Pattern for the image URLs embedded in the budejie.com pages.
# (An earlier experiment against http://pic.sogou.com/ used
# r'"image":"(.+?)"' instead.)
reg = r'data-original="(.+?\.gif)"'
savePath = 'image/gif/'


def _open(url, timeout=TIMEOUT):
    """Open *url* with the browser-like headers and return the response."""
    req = urllib.request.Request(url=url, headers=HEADERS)
    return urllib.request.urlopen(req, timeout=timeout)


def getHtml(url):
    """Fetch *url* and return the raw response body as bytes.

    Raises:
        urllib.error.URLError: if the request fails (HTTPError for
            non-success status codes such as 403).
    """
    # The context manager closes the connection even if read() raises.
    with _open(url) as page:
        return page.read()


def getImg(reg, savePath):
    """Create a downloader for every image URL matched by *reg*.

    Args:
        reg: regular expression whose first group captures an image URL.
        savePath: prefix for the output files; image number ``n`` is
            written to ``savePath + '<n>.gif'``.

    Returns:
        A function ``giveImg(html)`` that takes a page body (bytes, decoded
        as UTF-8) and downloads every matched image.  The file counter
        starts at 0 and continues across calls, so later pages do not
        overwrite earlier files.
    """
    imgre = re.compile(reg)  # compiled once, reused for every page
    iCnt = 0

    def giveImg(html):
        nonlocal iCnt
        for imgurl in imgre.findall(html.decode('utf-8')):
            target = savePath + '%s.gif' % iCnt
            # savePath is a plain string prefix, so derive the directory
            # from the final file name and create it on first use.
            folder = os.path.dirname(target)
            if folder:
                os.makedirs(folder, exist_ok=True)
            # Open the URL first: a rejected request then leaves no empty
            # file behind.  Images use the same headers as the pages.
            with _open(imgurl) as src, open(target, 'wb') as dst:
                shutil.copyfileobj(src, dst)
            iCnt += 1

    return giveImg


def main():
    """Download the images of pages 1 through 9."""
    g = getImg(reg, savePath)
    # Page 1 is the site root; start at 1 so it is fetched only once.
    for page in range(1, 10):
        if page == 1:
            url = BASE_URL
        else:
            url = BASE_URL + str(page)
            print(url)
        g(getHtml(url))


if __name__ == "__main__":
    main()
下载成功
金瓦金銮殿,皇上看不见;
一朝出了午门口,一个鼻子两只手。
金瓦金銮殿,皇上不坐殿;
一朝出了京门口,百姓的事儿牵着走牵着走。