Python3抓取网络图片
第一种、普通抓取方式
请求不携带任何头部(header)信息,适用于没有任何反爬措施的网站,例如 http://pic.ziweidan.com
# -*- encoding:utf-8 -*-
"""
@desc Download the images referenced by a plain web page (no request headers)
@author Sanplit
"""
import os
import re
import urllib.parse
import urllib.request

# Matches the value of a src="..." attribute that ends in .jpg, .png or .gif.
# The extensions are grouped so the alternation applies to the suffix only,
# and [^"] stops a match from running past the attribute's closing quote.
reg = r'src="([^"]+?\.(?:jpg|png|gif))"'
imgre = re.compile(reg, re.I)


def getHtml(url):
    """Fetch ``url`` and return its body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The page source as a string.

    Raises:
        urllib.error.URLError: If the page cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf8')


def createDir(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


def saveImg(imgList, path='img2\\'):
    """Download every image in ``imgList`` into the directory ``path``.

    Files are numbered from 1 in list order and keep the extension found in
    their URL (``.jpg`` when the URL has none).

    Args:
        imgList: Iterable of image URLs.
        path: Target directory; created when missing. The default is the
            original Windows-style relative path.
    """
    createDir(path)
    for imgIndex, imgUrl in enumerate(imgList, 1):
        # Take the extension from the URL path only, so a query string never
        # ends up in the local file name.
        ext = os.path.splitext(urllib.parse.urlsplit(imgUrl).path)[1] or '.jpg'
        # os.path.join keeps the file inside ``path`` whether or not ``path``
        # ends with a separator.
        target = os.path.join(path, '{}{}'.format(imgIndex, ext))
        urllib.request.urlretrieve(imgUrl, target)
    print('------ over -------')


if __name__ == '__main__':
    # Example of a site this works on: http://pic.ziweidan.com
    input_url = input('Please input your url: ')
    html = getHtml(input_url)
    # Collect the image addresses found in the page source.
    imglist = imgre.findall(html)
    if imglist:
        saveImg(imglist)
    else:
        print('game over! No img to spider')
第二种、浏览器伪装访问抓取
请求必须携带头部(header)信息,例如 User-Agent、Referer 等,否则服务器会拒绝访问,无法抓取
# Request headers that present the client as a desktop Chrome browser.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.3',
    'Referer': 'https://item.taobao.com/item.htm',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Connection': 'keep-alive',
}

# Proxy to use for plain-http requests.
proxy_addr = {'http': '88.146.227.253:8080'}

# Fetch the page with the headers and proxy above, then decode the raw bytes
# as GBK, dropping any bytes that are not valid GBK. (Reading response.text
# instead would leave the choice of encoding to requests.)
response = requests.get(input_url, headers=headers, proxies=proxy_addr)
details_content = response.content.decode('gbk', 'ignore')

# Parse the HTML and collect the src attribute of every <img> element via XPath.
html = etree.HTML(details_content)
imgList = html.xpath('//img/@src')