JD 评论晒图爬虫

#coding=utf-8
import requests
import re
import os

__author__ = 'depy'

"""
jd 评论晒图爬虫
@productId 商品id
@startpage 开始页数
@endpage 结束页数
"""

class JDPIC(object):
    """Crawler for the review photos attached to one JD product.

    Requests JD's image-comment list endpoint one page at a time, pulls the
    image URLs out of the response text and saves every image to a local
    directory.

    Args:
        productId: Product id sent to the endpoint as ``productId``.
        startpage: Start page number (stored for the caller's page loop).
        endpage: End page number (stored for the caller's page loop).
            Defaults to 20.
    """

    # Matches the scheme-less URL ("host/path") of each "imageUrl" field.
    _IMAGE_URL_RE = re.compile(r'"imageUrl":"//(.*?)"')

    def __init__(self, productId, startpage, endpage=20):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
            'Accept': '*/*',
            # Only advertise encodings requests can always decode; offering
            # "sdch"/"br" lets the server answer with a body that is left
            # compressed and therefore unreadable.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cookie': ''
        }
        self.url = 'https://club.jd.com/discussion/getProductPageImageCommentList.action'
        self.startpage = startpage
        self.productId = productId
        self.endpage = endpage

    @staticmethod
    def _filename_from_url(imgUrl):
        """Return the last path segment of *imgUrl*, without query/fragment.

        Returns an empty string when the URL ends with ``/``.
        """
        path = imgUrl.split('?', 1)[0].split('#', 1)[0]
        return path.split('/')[-1]

    def sendReq(self, page):
        """Fetch one page of image comments and return the image URLs in it.

        Args:
            page: Page number sent to the endpoint as ``page``.

        Returns:
            A list of scheme-less image URLs (``host/path``), in the order
            they appear in the response text; empty when none are found.

        Raises:
            requests.RequestException: On connection errors or timeouts.
        """
        params = {
            'productId': self.productId,
            'isShadowSku': '0',
            'callback': 'jQuery219465',
            'page': page,
            'pageSize': 20
        }
        r = requests.get(self.url, params=params, headers=self.headers, timeout=10)
        return self._IMAGE_URL_RE.findall(r.text)

    def downloadImageFile(self, imgUrl, dirName='JDPIC1'):
        """Download one image and store it under *dirName*.

        Args:
            imgUrl: Scheme-less image URL as returned by ``sendReq``.
            dirName: Target directory, created when missing.

        Raises:
            requests.RequestException: On connection errors or timeouts.
        """
        local_filename = self._filename_from_url(imgUrl)
        if not local_filename:
            # Nothing usable to name the file after; do not try to open a
            # directory path for writing.
            print("Skip image without a file name: %s" % imgUrl)
            return
        print("Download Image File= %s" % local_filename)
        r = requests.get('http://' + imgUrl, headers=self.headers, stream=True, timeout=20)
        try:
            if r.status_code != 200:
                # Do not save an error page under an image file name.
                print("Skip %s, HTTP status %s" % (local_filename, r.status_code))
                return
            if not os.path.isdir(dirName):
                os.makedirs(dirName)
            with open(os.path.join(dirName, local_filename), 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        finally:
            # A streamed response keeps its connection until it is closed.
            r.close()

if __name__ == '__main__':
    J = JDPIC(1111, 51, 100)  # replace 1111 with the product id to crawl
    # Walk every page from startpage to endpage (inclusive) and download
    # each image URL found on it.
    for page in range(int(J.startpage), int(J.endpage) + 1):
        for picurl in J.sendReq(page):
            J.downloadImageFile(picurl)

    # Parenthesized single-argument form prints the same text on Python 2 and 3.
    print("downpic success")

 

posted @ 2017-06-02 16:29  depycode  阅读(392)  评论(0)  编辑  收藏  举报