# JD 评论晒图爬虫 (JD review-photo crawler)
# coding=utf-8
"""Crawler for the photos buyers attach to JD product reviews.

Pages through JD's image-comment listing endpoint for one product, extracts
every ``imageUrl`` from each response with a regular expression, and saves the
images into a local directory.

Constructor arguments of :class:`JDPIC`:

* ``productId`` -- id of the product whose review photos are fetched
* ``startpage`` -- first listing page to request
* ``endpage``   -- last listing page to request (inclusive in the driver below)
"""
import os
import re

import requests

__author__ = 'depy'


class JDPIC(object):
    """Fetch review-photo URLs for one product and download the images."""

    def __init__(self, productId, startpage, endpage=20):
        """Store the crawl range and the request settings.

        :param productId: product id sent to the listing endpoint.
        :param startpage: first page number of the crawl range.
        :param endpage: last page number of the crawl range (default 20).
        """
        # The same headers are sent with the listing request and with every
        # image download.
        # NOTE(review): 'sdch' and 'br' are advertised in Accept-Encoding;
        # confirm the installed requests/urllib3 can decode them, otherwise
        # r.text in sendReq may not be readable text.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cookie': '',
        }
        self.url = 'https://club.jd.com/discussion/getProductPageImageCommentList.action'
        self.startpage = startpage
        self.productId = productId
        self.endpage = endpage

    def sendReq(self, page):
        """Request one listing page and return the image URLs found in it.

        :param page: page number to request.
        :returns: list of URL strings with the leading ``//`` removed, in the
            order they appear in the response body.
        """
        params = {
            'productId': self.productId,
            'isShadowSku': '0',
            'callback': 'jQuery219465',
            'page': page,
            'pageSize': 20,
        }
        r = requests.get(self.url, params=params, headers=self.headers,
                         timeout=10)
        # A 'callback' parameter is sent, so the body is presumably JSONP
        # rather than plain JSON; the URLs are pulled out with a regex instead
        # of a JSON parser.
        return re.findall(r'"imageUrl":"//(.*?)"', r.text)

    def downloadImageFile(self, imgUrl, dirName='JDPIC1'):
        """Download one image and save it under ``dirName``.

        The file is named after the last path segment of ``imgUrl``. Nothing
        is written when that segment is empty or when the server answers with
        an error status.

        :param imgUrl: scheme-less image URL as returned by :meth:`sendReq`.
        :param dirName: target directory, created on demand
            (default ``'JDPIC1'``).
        """
        local_filename = imgUrl.split('/')[-1]
        if not local_filename:
            # A URL ending in '/' has no file name; opening the directory
            # path for writing would raise.
            print("Skip Image Url Without File Name= %s" % imgUrl)
            return
        print("Download Image File= %s" % local_filename)

        r = requests.get('http://' + imgUrl, headers=self.headers,
                         stream=True, timeout=20)
        try:
            if not r.ok:
                # Do not store an HTTP error page under an image file name.
                print("Skip Image File= %s (HTTP %s)"
                      % (local_filename, r.status_code))
                return
            if not os.path.isdir(dirName):
                os.makedirs(dirName)
            with open(os.path.join(dirName, local_filename), 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        finally:
            # With stream=True the connection stays checked out until the
            # body is consumed or the response is closed.
            r.close()


if __name__ == '__main__':
    J = JDPIC(1111, 51, 100)  # change the product id as needed
    for page in range(int(J.startpage), int(J.endpage) + 1):
        for picurl in J.sendReq(page):
            J.downloadImageFile(picurl)
    # NOTE(review): the original nesting of this message could not be
    # recovered from the collapsed source; it is printed once, after all
    # pages have been processed.
    print("downpic success")