python -- JD Books (京东图书)

# -*- coding: utf-8 -*-
import scrapy
import urllib.request
import re
import random
from jdgoods.items import JdgoodsItem
from lxml import etree
from scrapy.http import Request


class GoodsSpider(scrapy.Spider):
    name = 'goods'
    allowed_domains = ['jd.com']
    url_lst = []
    pd_lst = []
    pd_pages = {}
    # start_urls = ['http://jd.com/']
    # Pool of User-Agent strings; one is chosen at random for each request.
    ua = ['Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
          'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
          'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
          'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
          ]

    def start_requests(self):
        # Fetch the JD books landing page with a random User-Agent.
        req1 = urllib.request.Request("https://book.jd.com/")
        user_agent = random.choice(self.ua)
        req1.add_header("User-Agent", user_agent)
        all_data = urllib.request.urlopen(req1).read().decode('gbk', 'ignore')
        # print(all_data)
        # Pull the category URLs out of the page's embedded JSON-like data.
        pat1 = '"URL":"(.*?)","ANCHOR":'
        all_html_data = re.compile(pat1).findall(all_data)
        # print(all_html_data)
        for i in all_html_data:
            a = i.split(',')
            # URLs are JSON-escaped ("\/"), so unescape them and add the scheme.
            self.url_lst.append("http:" + a[len(a) - 1].replace('"URL":"', "").replace('\\/', '/'))
            # print("http:" + a[len(a) - 1].replace('"URL":"', "").replace('\\/', '/'))
        # Deduplicate the collected category URLs.
        self.url_lst = list(set(self.url_lst))

        for j in self.url_lst:
            try:
                req2 = urllib.request.Request(j)
                user_agent = random.choice(self.ua)
                req2.add_header("User-Agent", user_agent)
                sub_data = urllib.request.urlopen(req2).read().decode('gbk', 'ignore')
                # Collect the "cat=..." parameter of every list.jd.com link.
                pat2 = 'href="//list.*cat=(.*?)[&"]'
                all_html_addr = re.compile(pat2).findall(sub_data)
                for lst_num in all_html_addr:
                    self.pd_lst.append(lst_num)
            except Exception:
                pass

        x = 0
        for a in self.pd_lst:
            this_url = 'https://list.jd.com/list.html?cat=' + a
            req3 = urllib.request.Request(this_url)
            user_agent = random.choice(self.ua)
            req3.add_header("User-Agent", user_agent)
            html_data = urllib.request.urlopen(req3).read().decode('utf-8', 'ignore')
            # The page count sits between two CJK characters, e.g. 共<b>100</b>页.
            pat3 = u"[\u4e00-\u9fa5]<b>(.*?)</b>[\u4e00-\u9fa5]"
            pages = re.compile(pat3).findall(html_data)
            self.pd_pages[a] = "".join(pages)
            x += 1
            if x > 1:
                break

        y = 0
        for key in self.pd_pages:
            # print(key + ":" + str(self.pd_pages[key]))
            for p in range(1, int(self.pd_pages[key])):
                thispage = 'https://list.jd.com/list.html?cat=' + key + '&page=' + str(p)
                # print(thispage)
                y += 1
                if y > 2:
                    break
                # yield Request(thispage, callback=self.parse)

        # For demonstration, only a single category list page is actually crawled.
        yield Request("https://list.jd.com/list.html?cat=1713,3260,3339", callback=self.parse)

    def parse(self, response):
        item = JdgoodsItem()
        try:
            # Current category breadcrumb.
            content_lst = response.xpath('//span[@class="curr"]/text()').extract()
            p_content = "---".join(content_lst)
            print(p_content)

            book_name_lst = response.xpath('//div[@class="p-name"]/a/em/text()').extract()
            # Product links; the SKU id is embedded in each item URL.
            book_price_html = response.xpath('//div[@class="p-img"]/a/@href').extract()
            book_pub_lst = response.xpath('//span[@class="p-bi-store"]/a/text()').extract()
            book_seller = response.xpath('//span[@class="curr-shop"]/text()').extract()

            # Price API: https://p.3.cn/prices/mgets?&skuIds=J_11481255
            print("书名--出版社----销售商---已下载")  # "title--publisher--seller--downloaded"
            skuIds = []
            price = []
            comment = []
            # Extract the numeric SKU id from each item URL, e.g. //item.jd.com/11481255.html.
            for n in range(len(book_price_html)):
                pat = r'//.*/(\d+)\.html'
                tmp = re.compile(pat).findall(book_price_html[n])
                skuIds.append("".join(tmp))

            for i in range(0, len(book_name_lst)):
                req4 = urllib.request.Request('https://p.3.cn/prices/mgets?&skuIds=J_' + str(skuIds[i]))
                user_agent = random.choice(self.ua)
                req4.add_header("User-Agent", user_agent)
                p = urllib.request.urlopen(req4).read().decode()
                # The response is a small JSON array; the "p" field holds the current price.
                pat = '"p":"(.*?)"'
                p1 = re.compile(pat).findall(p)
                price.append("".join(p1))
            print("书价格---已下载")  # "prices downloaded"
            # Comment counts: https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds=11338771
            for i in range(0, len(book_name_lst)):
                req5 = urllib.request.Request(
                    'https://club.jd.com/comment/productCommentSummaries.action?my=pinglun&referenceIds='
                    + str(skuIds[i]))
                user_agent = random.choice(self.ua)
                req5.add_header("User-Agent", user_agent)
                c = urllib.request.urlopen(req5).read().decode("utf-8", 'ignore')
                pat = '"CommentCount":(.*?),'
                c1 = re.compile(pat).findall(c)
                comment.append("".join(c1))
            print("书评论---已下载")  # "comment counts downloaded"
            for n in range(len(book_name_lst)):
                print(book_name_lst[n] + ':' + str(price[n]) + ':' + book_seller[n]
                      + ':' + book_pub_lst[n] + ':' + str(comment[n]))
        except Exception:
            pass

        # Note: no fields are ever assigned above, so an empty item is yielded.
        yield item
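
The spider imports JdgoodsItem from jdgoods.items, but the post never shows that file, and parse() yields the item without filling it in. A minimal sketch of what items.py could look like, with field names that are purely assumptions based on the data the spider prints (title, price, publisher, seller, comment count):

# jdgoods/items.py -- hypothetical sketch; the original file is not shown in the post
import scrapy


class JdgoodsItem(scrapy.Item):
    name = scrapy.Field()       # book title (assumed field name)
    price = scrapy.Field()      # price fetched from p.3.cn
    publisher = scrapy.Field()  # publisher
    seller = scrapy.Field()     # seller / shop name
    comment = scrapy.Field()    # comment count from club.jd.com

With the project's settings in place, the spider runs with the standard Scrapy command:

scrapy crawl goods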
posted @ 2017-12-10 23:28 沧海一粒水