Scrapy Framework: Scraping a Website's Products (by Category)

I. Result Screenshot

(screenshot omitted)

II. Sample Code

1. items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class HongxingItem(scrapy.Item):
    # Fields scraped for each product
    catname = scrapy.Field()  # category name (also used as the output folder)
    name = scrapy.Field()     # product name
    ico = scrapy.Field()      # product image URL

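For reference, a scrapy.Item behaves like a dict, which is how the spider and pipeline below read and write these three fields. A minimal sketch (the values here are made up):

item = HongxingItem()
item["catname"] = "端子系列"  # category name; becomes the output folder
item["ico"] = "http://www.hxdy.cn/example.jpg"  # hypothetical image URL
print(item["catname"], item["ico"])
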
2. pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import os
import requests

class HongxingPipeline(object):
    count = 0      # total number of items processed
    cat_dict = {}  # per-category item counters

    def process_item(self, item, spider):
        self.count += 1

        # Create the category folder if it does not exist yet
        if not os.path.exists(item["catname"]):
            os.mkdir(item["catname"])

        # Bump this category's counter; files are numbered per category
        count = self.cat_dict.get(item["catname"], 0) + 1
        self.cat_dict[item["catname"]] = count

        # Download the product image and save it locally
        response = requests.get(item['ico'])
        file = os.path.join(item["catname"], str(count) + ".jpg")
        with open(file, 'wb') as f:
            f.write(response.content)

        # Save the product name next to its image
        with open(os.path.join(item["catname"], str(count) + '.txt'), 'w', encoding='utf-8') as f:
            f.write(item['name'])
        return item

    def close_spider(self, spider):
        print("Total items scraped: {0}".format(self.count))

 

3. product.py

# -*- coding: utf-8 -*-
import scrapy
from ..items import HongxingItem
import requests
from lxml import etree

class ProductSpider(scrapy.Spider):
    name = 'product'
    allowed_domains = ['hxdy.cn']
    host = "http://www.hxdy.cn"

    # start_urls is repurposed here: start_requests() below reads these
    # (category name, URL template) pairs instead of plain URLs.
    start_urls = [
        {"name": '条形连接器', "url": host + '/products.asp?Small_Class=2&page={0}'},
        {"name": '贴片式连接器', "url": host + '/products.asp?Small_Class=3&page={0}'},
        {"name": '车用连接器', "url": host + '/products.asp?Small_Class=4&page={0}'},
        {"name": '洗衣机连接器', "url": host + '/products.asp?Small_Class=5&page={0}'},
        {"name": '空调冰箱插件', "url": host + '/products.asp?Small_Class=6&page={0}'},
        {"name": '保险丝管连接器', "url": host + '/products.asp?Small_Class=7&page={0}'},
        {"name": '电源骨架系列', "url": host + '/products.asp?Small_Class=8&page={0}'},
        {"name": '微波炉连接器', "url": host + '/products.asp?Small_Class=9&page={0}'},
        {"name": '硬护套系列', "url": host + '/products.asp?Small_Class=10&page={0}'},
        {"name": '软护套系列', "url": host + '/products.asp?Small_Class=11&page={0}'},
        {"name": '端子系列', "url": host + '/products.asp?Small_Class=12&page={0}'},
        {"name": '特种连接器', "url": host + '/products.asp?Small_Class=13&page={0}'},
        {"name": '机械手粉碎机', "url": host + '/products.asp?Small_Class=16&page={0}'},
    ]

    # Get the total page count for a category by reading the last
    # pagination link on the first listing page
    def get_all_page(self, url):
        response = requests.get(url)
        html = etree.HTML(response.content, parser=etree.HTMLParser())
        res = html.xpath('//ul[@class="pagination"]')
        if len(res) > 0:
            u = res[0].xpath("./li[last()]//a/@href")[0]
            return int(u.split('page=')[1])
        return 1
    
    def start_requests(self):
        for item in self.start_urls:
            # Find out how many pages this category has
            url = item.get('url')
            total_page = self.get_all_page(url.format('1'))
            for page in range(1, total_page + 1):
                link = url.format(str(page))
                yield scrapy.Request(link, callback=self.parse, meta={"url": link, "name": item.get('name')})

    def parse(self, response):
        meta = response.meta
        print("Currently scraping: {0}".format(meta['url']))

        for each in response.xpath('//div[@class="product_list wow fadeInUp"]//ul//li'):
            url = each.xpath("./a/@href").extract()[0]
            item = HongxingItem()
            item['catname'] = meta['name']
            # Follow the product detail page; urljoin resolves relative hrefs safely
            yield scrapy.Request(response.urljoin(url), callback=self.url_parse, meta={"item": item})

    def url_parse(self, response):
        item = response.meta['item']
        item['name'] = response.xpath("//div[@class='product_t']//h3//text()").extract()[0]
        item['ico'] = response.urljoin(response.xpath("//div[@id='product_show_01']//img/@src").extract()[0])
        yield item
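
The spider is normally launched with "scrapy crawl product" from the project root. Equivalently, a small launcher script can be used (a sketch; the file name run.py is arbitrary):

# run.py -- programmatic equivalent of "scrapy crawl product"
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "product"])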

 
