爬取淘宝上面图片

 1 #自动爬取淘宝图片
 2 #先看看淘宝上月饼相连的三个链接的例子(分别是第二页、第三页、第四页):
 3 #可以看到&p4ppushleft=1%2C48&s=44,88,132  每跳转一页增加44  这里是跳转页面抓图片的关键
 4 '''
 5 https://s.taobao.com/search?q=%E6%9C%88%E9%A5%BC
 6 &imgfile=&commend=all
 7 &ssid=s5-e
 8 &search_type=item
 9 &sourceId=tb.index
10 &spm=a21bo.2017.201856-taobao-item.1
11 &ie=utf8
12 &initiative_id=tbindexz_20170306
13 &bcoffset=3
14 &ntoffset=3
15 &p4ppushleft=1%2C48&s=44'''
16 
17 '''
18 https://s.taobao.com/search?q=%E6%9C%88%E9%A5%BC
19 &imgfile=&commend=all
20 &ssid=s5-e&search_type=item
21 &sourceId=tb.index
22 &spm=a21bo.2017.201856-taobao-item.1
23 &ie=utf8&initiative_id=tbindexz_20170306
24 &bcoffset=0
25 &ntoffset=6
26 &p4ppushleft=1%2C48&s=88'''
27 
28 '''
29 https://s.taobao.com/search?q=%E6%9C%88%E9%A5%BC
30 &imgfile=&commend=all
31 &ssid=s5-e&search_type=item
32 &sourceId=tb.index
33 &spm=a21bo.2017.201856-taobao-item.1
34 &ie=utf8
35 &initiative_id=tbindexz_20170306
36 &bcoffset=-3
37 &ntoffset=-3
38 &p4ppushleft=1%2C48&s=132'''
39 import urllib.request
40 import re
41 keyname="月饼"
42 #汉字编码keyname可任意更换
43 key=urllib.request.quote(keyname)
44 
45 #伪装成google浏览器报头  去浏览器中找到 use-Agent
46 headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36")
47 #创建oenener对象
48 opener=urllib.request.build_opener()
49 opener.addheaders=[headers]
50 #把opener添加为全局
51 urllib.request.install_opener(opener)
52 
53 for i in range(0,3):
54     url="https://s.taobao.com/search?q="+key+"&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&bcoffset=-3&ntoffset=-3&p4ppushleft=1%2C48&s="+str(i*44);
55     data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
56 #找某一个图片的地址关键(加黑地方)https://g-search1.alicdn.com/img/bao/uploaded/i4/i3/1743116853/TB2mzuwaxnaK1RjSZFtXXbC2VXa_!!1743116853-0-item_pic.jpg_180x180.jpg_.webp  在淘宝后台利用正则找到图片的链接
57     pat='pic_url":"//(.*?)"'
58     imagelist=re.compile(pat).findall(data)
59     for j in range(0,len(imagelist)):
60         thisimg=imagelist[j]
61         thisimagurl="http://"+thisimg
62         file="F:/python/python爬虫/img/"+str(i)+str(j)+".jpg"
63         urllib.request.urlretrieve(thisimagurl,filename=file)

 

posted @ 2018-09-17 15:24  发酸的丶蛋炒饭  阅读(595)  评论(0)  编辑  收藏  举报