Python 3 下爬取网页图片并保存到本地的爬虫程序
import re
import urllib.request

# Fetch a page, extract its image URLs, and save each image to local disk.


def getHtml(url):
    """Return the raw response body of *url* as bytes.

    The HTTP response is closed as soon as the body has been read.
    """
    with urllib.request.urlopen(url) as response:
        return response.read()


def getImg(html):
    """Return the list of ``.jpg`` URLs found in the text *html*.

    Only ``src="....jpg"`` attributes that are immediately followed by the
    literal text `` pic_ext`` are matched; results are in document order.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    pattern = re.compile(r'src="(.+?\.jpg)" pic_ext')
    return pattern.findall(html)


def main():
    """Download every matched image from the page and save it on drive D:."""
    page = getHtml("http://*****")
    # bytes.decode() with no argument decodes as UTF-8.
    img_urls = getImg(page.decode())

    # Save each image as d:\<index>.jpg, printing the index as progress.
    for index, img_url in enumerate(img_urls):
        print(index)
        urllib.request.urlretrieve(img_url, 'd:\\%s.jpg' % index)

    print("done")


if __name__ == "__main__":
    main()
从指定 URL 下载图片并保存到 AWS S3
import boto3
import urllib.request


def lambda_handler(request, context):
    """Copy one hard-coded image from the web into an S3 bucket.

    Neither ``request`` nor ``context`` is read by this function.

    Returns:
        dict: ``{"url": ...}`` where the value is
        ``https://s3.amazonaws.com/<bucket>/<key>`` for the uploaded object.
    """
    download_url = "https://gss2.bdstatic.com/-fo3dSag_xI4khGkpoWK1HF6hhy/baike/s%3D220/sign=3707d191fa03738dda4a0b20831bb073/279759ee3d6d55fb3cfdd81761224f4a20a4ddcc.jpg"
    # The object key is the last path segment of the source URL.
    upload_key = download_url.rsplit('/', 1)[-1]

    # Close the HTTP response as soon as the body has been read, and keep the
    # bytes in their own name instead of overwriting the `context` parameter.
    with urllib.request.urlopen(url=download_url) as response:
        image_bytes = response.read()

    bucket = "testforcustomerservice"
    s3 = boto3.resource("s3")
    file_obj = s3.Bucket(bucket).put_object(Key=upload_key, Body=image_bytes)
    print(file_obj)

    # Build the URL from `bucket` so the two cannot drift apart.
    return {
        "url": "https://s3.amazonaws.com/" + bucket + "/" + upload_key
    }