"""Crawl product images from Taobao search results.

The script searches Taobao for a keyword, walks the first few result pages
and saves every product thumbnail it finds to a local directory.

Pagination is driven by the ``s`` query parameter: each step to the next
page adds 44 to it. Three consecutive result pages (the second, third and
fourth page) of a mooncake search looked like this:

    https://s.taobao.com/search?q=%E6%9C%88%E9%A5%BC
    &imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index
    &spm=a21bo.2017.201856-taobao-item.1&ie=utf8
    &initiative_id=tbindexz_20170306
    &bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=44

    ...same parameters up to initiative_id...
    &bcoffset=0&ntoffset=6&p4ppushleft=1%2C48&s=88

    ...same parameters up to initiative_id...
    &bcoffset=-3&ntoffset=-3&p4ppushleft=1%2C48&s=132
"""
import re
import sys
import urllib.parse
import urllib.request
from pathlib import Path

# Search keyword; it can be replaced with any other term.
KEYWORD = "月饼"

# Number of result pages to crawl, starting from the first one (s=0).
PAGE_COUNT = 3

# Amount the "s" query parameter grows by for every result page.
ITEMS_PER_PAGE = 44

# Directory the downloaded images are written to.
SAVE_DIR = Path("F:/python/python爬虫/img/")

# Chrome User-Agent string (copied from the browser's request headers) so
# the requests look like they come from a regular browser.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/68.0.3440.106 Safari/537.36"
)

SEARCH_URL_PREFIX = "https://s.taobao.com/search?q="
SEARCH_URL_PARAMS = (
    "&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index"
    "&spm=a21bo.2017.201856-taobao-item.1&ie=utf8"
    "&initiative_id=tbindexz_20170306&bcoffset=-3&ntoffset=-3"
    "&p4ppushleft=1%2C48&s="
)

# Image addresses appear in the page source as  "pic_url":"//host/path" ;
# the group captures everything after the leading "//".
PIC_URL_PATTERN = re.compile('pic_url":"//(.*?)"')


def build_search_url(key, page):
    """Return the search URL for result page *page* (0-based).

    *key* is the already percent-encoded search keyword.
    """
    return SEARCH_URL_PREFIX + key + SEARCH_URL_PARAMS + str(page * ITEMS_PER_PAGE)


def extract_image_urls(html):
    """Return the image URLs referenced by ``pic_url`` entries in *html*.

    A typical image address looks like
    https://g-search1.alicdn.com/img/bao/uploaded/i4/i3/1743116853/TB2mzuwaxnaK1RjSZFtXXbC2VXa_!!1743116853-0-item_pic.jpg_180x180.jpg_.webp
    The page source stores it protocol-relative, so a scheme is prepended.
    Returns an empty list when nothing matches.
    """
    return ["http://" + match for match in PIC_URL_PATTERN.findall(html)]


def image_filename(page, index):
    """Return the file name for image number *index* found on page *page*.

    The two numbers are separated by an underscore so that, for example,
    page 1 / image 12 and page 11 / image 2 cannot map to the same name.
    """
    return "{}_{}.jpg".format(page, index)


def main():
    """Download the thumbnails of the first PAGE_COUNT result pages."""
    # Build an opener that sends the browser User-Agent and install it
    # globally so that both urlopen() and urlretrieve() use it.
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-Agent", USER_AGENT)]
    urllib.request.install_opener(opener)

    # urlretrieve() does not create missing directories.
    SAVE_DIR.mkdir(parents=True, exist_ok=True)

    # Percent-encode the (Chinese) keyword for use in the query string.
    key = urllib.parse.quote(KEYWORD)

    for page in range(PAGE_COUNT):
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(build_search_url(key, page)) as response:
            html = response.read().decode("utf-8", "ignore")

        for index, image_url in enumerate(extract_image_urls(html)):
            target = SAVE_DIR / image_filename(page, index)
            try:
                urllib.request.urlretrieve(image_url, filename=str(target))
            except OSError as err:
                # URLError/HTTPError are OSError subclasses. One broken
                # image should not abort the rest of the crawl.
                print(
                    "failed to download {}: {}".format(image_url, err),
                    file=sys.stderr,
                )


if __name__ == "__main__":
    main()