Scraping Suning Book Information

The results look like this:

[result screenshots]
Notes:

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

version_1

The next-page request now works perfectly. Sure enough, nothing in this world is difficult, as long as you dare to give things up.

The constructed next-page request should go to the static interface; that way the data comes back correctly and it is complete.

Takeaway: when constructing a URL, it is worth deleting a few parameters and testing in the browser, to rule out the irrelevant ones.
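For example, one quick way to find out which query parameters actually matter is to drop them one at a time and compare the response size. A throwaway sketch (it uses the requests library, a subset of the parameters from the showProductList.do URL analysed further down, and a rough size threshold picked by hand):

import requests

base = "https://list.suning.com/emall/showProductList.do"
params = {"ci": "502675", "pg": "03", "cp": "1", "il": "0", "iy": "0",
          "adNumber": "0", "n": "1", "ch": "4", "cc": "728"}

full_len = len(requests.get(base, params=params).text)
for key in list(params):
    trimmed = {k: v for k, v in params.items() if k != key}
    resp = requests.get(base, params=trimmed)
    # if the response barely changes without the parameter, it is probably irrelevant
    diff = abs(len(resp.text) - full_len)
    print(key, "probably irrelevant" if diff < 200 else "matters", len(resp.text))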

# pull the current page number and the total page count out of the page's inline JS
current_page = re.findall(r'param.currentPage = "(.*?)";', response.body.decode(), re.S)
current_page = int(current_page[0]) if current_page else -1
# page_numbers = int(re.findall(r'param.pageNumbers = "(.*?)";', response.body.decode(), re.S)[0])
page_numbers = re.findall(r'param.pageNumbers = "(.*?)";', response.body.decode(), re.S)
page_numbers = int(page_numbers[0]) if page_numbers else -1
print("*"*50)
print("current_page:%s,page_numbers:%s"%(current_page,page_numbers))
print("*"*50)
# the category id (ci) also sits in the inline JS
ci = int(re.findall(r"cateid':\'(.*?)\'",response.body.decode(),re.S)[0])

url = "https://list.suning.com/1-{}-{}.html" # 在浏览器端,后面加上一个页码编号,是可行的,但是当请求第4页时,就会抓不到数据
num = 1
# next_request_url = "https://list.suning.com/emall/showProductList.do?ci={}&pg=03&cp={}&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=728"
while num < page_numbers:
    yield scrapy.Request(
        url = url.format(ci,num),
        callback = self.parse_s_cate_href,
        meta = {"item":deepcopy(response.meta["item"])}
    )
    num += 1

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

version_0


Page analysis for https://list.suning.com/1-502687-0.html

<script type="text/javascript">
    if (typeof param != 'undefined') {
        param.currentPage = "0";
        param.pageNumbers = "100";
        param.numFound = "27537";
        if ("ssdln_502687" == "{pageType}") {
            makeProductName($("#filter-results,#bottom_pager"));
        }
        param.mutil = false;

Remember that every page records its own current page (currentPage) and total page count (pageNumbers).



The useful information sits under class="filter-results productMain clearfix temporary": the book list and the next-page bar.

Always check whether the requested URL is being dropped as "Filtered offsite request to 'list.suning.com'"; every domain the spider requests has to be in allowed_domains.


Analysis of the price request URL
https://product.suning.com/0071038629/11855521483.html

https://product.suning.com/0071014399/11516769347.html
https://pas.suning.com/nspcsale_0_000000011516769347_000000011516769347_0071014399_
170_728_7280100_502282_1000149_9149_11181_Z001___R9011205_3.0________0___0.0_2_.html

https://product.suning.com/0071038629/11855521483.html
https://pas.suning.com/nspcsale_0_000000011855521483_000000011855521483_0071038629_
170_728_7280100_502282_1000149_9149_11181_Z001___R9011205_10.0________0___0.0_2_.html

https://product.suning.com/0070091633{shopid}/10717510914{prdid}.html
https://pas.suning.com/nspcsale_0_000000010717510914_000000010717510914_0070091633_
170_728_7280100_502282_1000149_9149_11181_Z001___R9011205_1.0________0___1.0_2_.html

# the trimmed price-request URL that gets constructed
https://pas.suning.com/nspcsale_0_0000000{prdid}_0000000{prdid}_{shopid}_
170_728.html
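Putting that together, a small sketch of turning a product-page URL into the trimmed price URL above (the regex and the seven-zero padding mirror what parse_book_href does in the source code further down):

import re

def build_price_url(product_url):
    # product pages look like https://product.suning.com/{shopid}/{prdid}.html
    shopid, prdid = re.findall(r'com/(.*?)/(.*?)\.html', product_url)[0]
    # the price endpoint pads the product id with seven leading zeros
    return ('https://pas.suning.com/nspcsale_0_0000000{p}_0000000{p}_{s}_170_728.html'
            .format(p=prdid, s=shopid))

print(build_price_url("https://product.suning.com/0070091633/10717510914.html"))
# -> https://pas.suning.com/nspcsale_0_000000010717510914_000000010717510914_0070091633_170_728.html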


Next-page analysis for a sub-category

Page 1: each sub-category has one ci.
How the ci is obtained:
https://list.suning.com/emall/showProductList.do?ci=502675&pg=03&cp=0&il=0&iy=0
&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&paging=1&sub=0
Page 2
First request
https://list.suning.com/emall/showProductList.do?ci=502675&pg=03&cp=1&il=0&iy=0
&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=728
Second request
https://list.suning.com/emall/showProductList.do?ci=502675&pg=03&cp=1&il=0&iy=0
&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=728&paging=1&sub=0
Page 3
First request
https://list.suning.com/emall/showProductList.do?ci=502675&pg=03&cp=2&il=0&iy=0
&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=728
Second request
https://list.suning.com/emall/showProductList.do?ci=502675&pg=03&cp=2&il=0&iy=0
&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=728&paging=1&sub=0
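So, from page 2 onward, the same showProductList.do URL is requested twice per page, the second time with &paging=1&sub=0 appended (each half of the list comes back separately). A rough sketch that only records the observed pattern (the fixed values are copied from the captures above; as noted below, this approach was never made to work in this version):

base = ("https://list.suning.com/emall/showProductList.do?"
        "ci={ci}&pg=03&cp={cp}&il=0&iy=0&adNumber=0&n=1&ch=4"
        "&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=728")

def page_requests(ci, cp):
    # cp is zero-based: cp=1 is the second results page
    first = base.format(ci=ci, cp=cp)        # upper half of the page
    second = first + "&paging=1&sub=0"       # lower half of the page
    return first, second

print(page_requests(502675, 1))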

Pagination was not implemented; constructing the next-page request failed.

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

The source code is as follows:

# -*- coding: utf-8 -*-
import scrapy
from copy import deepcopy
import re
from pprint import pprint
import json


class SntsSpider(scrapy.Spider):
    name = 'snts'
    allowed_domains = ['book.suning.com','list.suning.com','product.suning.com','pas.suning.com']
    start_urls = ['https://book.suning.com/']

    def parse(self, response):
        # with open("test.txt",'w',encoding='utf8') as f:
        #     f.write(response.body.decode())
        item = dict()
        # get the top-level categories
        b_cate_list = response.xpath("//div[@class='menu-list']//div[@class='menu-item']//h3/a")
        num = 1     # counter
        for b_cate in b_cate_list:
            item["b_cate"] = b_cate.xpath("./text()").extract_first()   # top-level category name
            # get the mid-level categories
            # each top-level category corresponds one-to-one to an expanded sub-menu on the right
            menu_sub_one = response.xpath("//div[@class='menu-list']//div[@class='menu-sub'][{}]".format(num))
            # left side of the expanded sub-menu
            submenu_left = menu_sub_one.xpath(".//div[@class='submenu-left']")
            # get p_list
            p_list = submenu_left.xpath("./p[@class='submenu-item']")
            # num_ is a counter
            num_ = 1
            for p in p_list:
                item["m_cate"] = p.xpath("./a/text()").extract_first()  # 获取中标题
                # 获取中标题下的小标题分类
                s_cate_list = submenu_left.xpath("./ul[@class='book-name-list clearfix'][{}]/li".format(num_))
                for s_cate in s_cate_list:
                    item["s_cate"] = s_cate.xpath("./a/text()").extract_first()
                    s_cate_href = s_cate.xpath("./a/@href").extract_first()
                    yield scrapy.Request(
                        url=s_cate_href,
                        callback=self.parse_s_cate_href,
                        meta={"item":deepcopy(item)}
                    )
                num_ += 1
            num += 1

    def parse_s_cate_href(self, response):
        item = deepcopy(response.meta["item"])
        item["book_info"] = dict()
        # grab the useful block: it contains the book list and the next-page link; the current page (currentPage) and total page count (pageNumbers) can be extracted with re
        useful_info = response.xpath("//div[@class='filter-results productMain clearfix  temporary']")
        # print(useful_info)
        # locate the book list
        book_li_list = useful_info.xpath(".//ul[@class='clearfix']/li")
        # grab the book title, cover image and detail-page URL
        for book_li in book_li_list:
            item["book_info"]["title"] = book_li.xpath(".//div[@class='res-img']/div[@class='img-block']/a/img/@alt").extract_first()
            item["book_info"]["image"] = book_li.xpath(".//div[@class='res-img']/div[@class='img-block']/a/img/@src2").extract_first()
            item["book_info"]["image"] = "https:"+item["book_info"]["image"] if item["book_info"]["image"] else None
            item["book_info"]["href"] = book_li.xpath(".//div[@class='res-img']/div[@class='img-block']/a/@href").extract_first()
            item["book_info"]["href"] = "https:"+item["book_info"]["href"]
            yield scrapy.Request(
                url = item["book_info"]["href"],
                callback = self.parse_book_href,
                meta = {"item":deepcopy(item)}
            )

        # the block below never worked; keep trying
        # build the next-page request
        # note: requesting showProductList.do?ci=502675* only returns the first or the second half of the data
        # but URLs like https://list.suning.com/1-502675-4-0-0-0-0-0-0-4.html return the complete data, though they are hard to construct
        # pageNumbers = re.findall(r'param.pageNumbers = "(.*?)";',response.body.decode(),re.S)
        # pageNumbers = int(pageNumbers[0]) if pageNumbers else -1
        # nextPage = re.findall(r'param.currentPage = "(.*?)";', response.body.decode(), re.S)
        # nextPage = int(nextPage[0]) + 1 if nextPage else 1
        # print("nextPage:",nextPage)
        # print("pageNumbers:",pageNumbers)
        # # url = re.findall(r"")
        # next_url = "https://list.suning.com/1-502697-{}-0-0-0-0-0-0-4.html"
        # while nextPage<pageNumbers:
        #     yield scrapy.Request(
        #         url = next_url.format(nextPage),
        #         callback = self.parse_s_cate_href,
        #         meta = {"item":deepcopy(item)}
        #     )



        # pageNumbers = int(re.findall(r'param.pageNumbers = "(.*?)";',response.body.decode(),re.S)[0])
        # # pageNumbers = int(pageNumbers[0]) if pageNumbers else -1
        # nextPage = int(re.findall(r'param.currentPage = "(.*?)";', response.body.decode(), re.S)[0])+1
        # # nextPage = int(nextPage[0]) + 1 if nextPage else 1
        # print("nextPage:",nextPage)
        # print("pageNumbers:",pageNumbers)
        # # grab ci
        # ci = re.findall(r"cateid':\'(.*?)\'",response.body.decode(),re.S)[0]
        # print(ci)
        # # # request the upper half of the data; cp is the current page number
        # body ={
        #     "Referer":response.url
        # }
        next_url_up = "https://list.suning.com/emall/showProductList.do?ci=502687&pg=03&cp=0&il=0&iy=0&adNumber=0&n=1&ch=4&prune=0&sesab=ACBAABC&id=IDENTIFYING&cc=728&paging=1&sub=0"
        # while nextPage<pageNumbers:
        #     yield scrapy.Request(
        #         url = next_url_up.format(ci,nextPage),
        #         callback = self.parse_s_cate_href,
        #         meta = {"item":deepcopy(item)}
        #     )
        # while nextPage<pageNumbers:
        #     yield scrapy.Request(
        #         url = next_url_up.format(nextPage),
        #         callback = self.parse_s_cate_href,
        #         body = json.dumps(body),
        #         meta = {"item":deepcopy(item)}
        #     )
        # # request the lower half of the data
        # next_url_down = ""
        # while nextPage<pageNumbers:
        #     yield scrapy.Request(
        #         url = next_url_down.format(ci,nextPage),
        #         callback = self.parse_s_cate_href,
        #         meta = {"item":deepcopy(item)}
        #     )


    def parse_book_href(self,response):
        item = response.meta["item"]
        item["book_info"]["author"] = response.xpath("//ul[@class='bk-publish clearfix']/li[1]/text()").extract_first()
        item["book_info"]["author"] = re.sub(r'\r+|\t+|\n+|\|', "", item["book_info"]["author"].strip()) if item["book_info"]["author"] else None
        item["book_info"]["book_publish"] = response.xpath("//ul[@class='bk-publish clearfix']/li[2]/text()").extract_first()
        item["book_info"]["book_publish"] = item["book_info"]["book_publish"].strip() if item["book_info"]["book_publish"] else None
        item["book_info"]["book_description"] = response.xpath("//img[@onload='if(this.width>750){this.height=this.height*(750.0/this.width); this.width = 750;}']/@src2").extract()
        item["book_info"]["book_description"] = ["https:"+i for i in item["book_info"]["book_description"]] if item["book_info"]["book_description"] else None
        shopid, prdid = zip(re.findall(r'com/(.*?)/(.*?)\.html',response.url,re.S)[0])
        price_url = 'https://pas.suning.com/nspcsale_0_0000000{}_0000000{}_{}_170_728.html'.format(prdid[0],prdid[0],shopid[0])
        yield scrapy.Request(
            url = price_url,
            callback = self.parse_price,
            meta = {"item":deepcopy(item)}
        )

    def parse_price(self,response):
        item = response.meta["item"]
        item["book_info"]["book_price"] = re.findall(r'"promotionPrice":"(.*?)"', response.body.decode(), re.S)[0]
        pprint(item)
