python3下爬取网页上的图片的爬虫程序

 1 import urllib.request
 2 import re
 3 #py抓取页面图片并保存到本地
 4 
 5 #获取页面信息
 6 def getHtml(url):
 7     html = urllib.request.urlopen(url).read()
 8     return html
 9 
def getImg(html):
    """Extract image URLs from page markup with a regular expression.

    Every ``src="....jpg"`` attribute that is immediately followed by a
    ``pic_ext`` attribute is matched, and the captured URL is returned.

    Args:
        html: Page markup as ``str``.

    Returns:
        A list of the captured ``.jpg`` URL strings in the order they were
        found; empty when nothing matches.
    """
    # Raw string: ``\.`` must reach the regex engine verbatim rather than be
    # parsed as an (invalid) string escape sequence.
    imgre = re.compile(r'src="(.+?\.jpg)" pic_ext')
    return imgre.findall(html)
17 
# Fetch the page (the URL is masked in the original post; substitute the real
# address) and pull the image URLs out of the decoded markup.
html = getHtml("http://*****")
image_urls = getImg(html.decode())

# Save every image to the local disk, using its position in the list as the
# file name. ``enumerate`` replaces the hand-maintained counter, and the list
# no longer shadows the builtin ``list``.
for index, image_url in enumerate(image_urls):
    print(index)
    urllib.request.urlretrieve(image_url, 'd:\\%s.jpg' % index)

print("done")

从指定网址获取图片并保存到 AWS S3

 1 import boto3
 2 import urllib.request
 3 
 4 
 5 def lambda_handler(request, context):
 6     #download_url = "https://s3.amazonaws.com/testforcustomerservice/192x192.png"
 7     download_url = "https://gss2.bdstatic.com/-fo3dSag_xI4khGkpoWK1HF6hhy/baike/s%3D220/sign=3707d191fa03738dda4a0b20831bb073/279759ee3d6d55fb3cfdd81761224f4a20a4ddcc.jpg"
 8     list = download_url.split('/')
 9     upload_key = list[len(list)-1]
10     response = urllib.request.urlopen(url=download_url)
11     context = response.read()
12     #print(context)
13     bucket = "testforcustomerservice"
14     s3 = boto3.resource("s3")
15     file_obj = s3.Bucket(bucket).put_object(Key=upload_key, Body=context)
16     print(file_obj)
17     response = {
18         "url": "https://s3.amazonaws.com/testforcustomerservice/"+upload_key
19     }
20     return response

 

posted @ 2018-10-12 19:10  sen_c7  阅读(762)  评论(0编辑  收藏  举报