Scrapy框架之采集某网站产品(按分类采集)
# -*- coding: utf-8 -*-

# Item models for the data collected by this project.
#
# See documentation in:
#

import scrapy


class HongxingItem(scrapy.Item):
    """One scraped product, as passed from the spider to the pipeline."""

    # Name of the category the product was listed under.
    catname = scrapy.Field()
    # Product title text.
    name = scrapy.Field()
    # URL of the product image.
    ico = scrapy.Field()
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See:
import json
import os

import requests


class HongxingPipeline(object):
    """Persist every scraped product to disk, grouped by category.

    For each item a directory named after ``item["catname"]`` is created
    (relative to the current working directory).  Inside it the product
    image is stored as ``<n>.jpg`` and the product name as ``<n>.txt``,
    where ``<n>`` is a per-category running number starting at 1.
    """

    # Class-level defaults are kept so ``HongxingPipeline.count`` and
    # ``HongxingPipeline.cat_dict`` still resolve; every instance gets its
    # own values in ``__init__`` so state is never shared between instances.
    count = 0
    cat_dict = {}

    def __init__(self):
        # Total number of items seen by this pipeline instance.
        self.count = 0
        # Maps category name -> number of products saved so far.
        self.cat_dict = {}

    def process_item(self, item, spider):
        """Save the image and the name of ``item`` and return the item.

        Args:
            item: mapping with the keys ``catname``, ``name`` and ``ico``
                (``ico`` is fetched with ``requests.get``).
            spider: the spider that produced the item; only its ``logger``
                is used, to report a failed image download.

        Returns:
            The unchanged ``item``.
        """
        self.count += 1

        category = item["catname"]
        # Make sure the category folder exists (no check-then-create race).
        os.makedirs(category, exist_ok=True)

        index = self.cat_dict.get(category, 0) + 1
        self.cat_dict[category] = index
        base_path = os.path.join(category, str(index))

        # Save the remote image locally.  A failed download is logged and
        # skipped so an HTTP error page is never written out as a ".jpg".
        try:
            response = requests.get(item["ico"], timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            spider.logger.warning(
                "Image download failed for %s: %s", item["ico"], exc
            )
        else:
            with open(base_path + ".jpg", "wb") as image_file:
                image_file.write(response.content)

        # Explicit encoding: product names are Chinese text and must not
        # depend on the platform's default locale encoding.
        with open(base_path + ".txt", "w", encoding="utf-8") as name_file:
            name_file.write(item["name"])

        return item

    def close_spider(self, spider):
        """Print the total number of processed items when the spider closes."""
        print("总共采集:{0}".format(str(self.count)))
# -*- coding: utf-8 -*-
import requests
import scrapy
from lxml import etree

from ..items import HongxingItem


class ProductSpider(scrapy.Spider):
    """Crawl the product listing of every category and yield one item per product.

    Flow: ``start_requests`` walks the category table in ``start_urls`` and
    requests every listing page; ``parse`` follows each product link found
    on a listing page; ``url_parse`` extracts the product name and image URL
    from the detail page and yields a ``HongxingItem``.
    """

    name = 'product'
    # NOTE(review): ``allowed_domains`` and ``host`` are empty in this source.
    # ``host`` must be set to the site's base URL (scheme + domain) before the
    # listing URLs below become valid absolute URLs -- confirm the real values.
    allowed_domains = ['']
    host = ""
    url = host + '/products.asp?Small_Class=16&page={0}'
    # Category table: display name plus a listing-URL template whose ``{0}``
    # placeholder is the page number.  These are dicts rather than plain URL
    # strings and are consumed by the overridden ``start_requests`` below.
    start_urls = [
        {"name": '条形连接器', "url": host + '/products.asp?Small_Class=2&page={0}'},
        {"name": '贴片式连接器', "url": host + '/products.asp?Small_Class=3&page={0}'},
        {"name": '车用连接器', "url": host + '/products.asp?Small_Class=4&page={0}'},
        {"name": '洗衣机连接器', "url": host + '/products.asp?Small_Class=5&page={0}'},
        {"name": '空调冰箱插件', "url": host + '/products.asp?Small_Class=6&page={0}'},
        {"name": '保险丝管连接器', "url": host + '/products.asp?Small_Class=7&page={0}'},
        {"name": '电源骨架系列', "url": host + '/products.asp?Small_Class=8&page={0}'},
        {"name": '微波炉连接器', "url": host + '/products.asp?Small_Class=9&page={0}'},
        {"name": '硬护套系列', "url": host + '/products.asp?Small_Class=10&page={0}'},
        {"name": '软护套系列', "url": host + '/products.asp?Small_Class=11&page={0}'},
        {"name": '端子系列', "url": host + '/products.asp?Small_Class=12&page={0}'},
        {"name": '特种连接器', "url": host + '/products.asp?Small_Class=13&page={0}'},
        {"name": '机械手粉碎机', "url": host + '/products.asp?Small_Class=16&page={0}'},
    ]

    def get_all_page(self, url):
        """Return the total number of listing pages for a category.

        The page is fetched synchronously with ``requests``.  The count is
        read from the ``page=`` parameter of the link inside the last
        ``<li>`` of the first ``ul.pagination`` element.

        Args:
            url: URL of the first listing page of the category.

        Returns:
            The page count as an ``int``; ``1`` when there is no pagination
            block or the page number cannot be parsed from its last link.
        """
        response = requests.get(url, timeout=30)
        html = etree.HTML(response.content, parser=etree.HTMLParser())
        pagination = html.xpath('//ul[@class="pagination"]')
        if not pagination:
            return 1
        hrefs = pagination[0].xpath("./li[last()]//a/@href")
        if not hrefs:
            return 1
        try:
            # Cut at the next "&" so a trailing query parameter does not
            # end up inside the number.
            return int(hrefs[0].split('page=')[1].split('&')[0])
        except (IndexError, ValueError):
            return 1

    def start_requests(self):
        """Yield one request per listing page of every configured category."""
        print(self.start_urls)
        for category in self.start_urls:
            url_template = category.get('url')
            # Find out how many pages this category has.
            total_page = self.get_all_page(url_template.format('1'))
            for page in range(1, total_page + 1):
                link = url_template.format(str(page))
                yield scrapy.Request(
                    link,
                    callback=self.parse,
                    meta={"url": link, "name": category.get('name')},
                )

    def parse(self, response):
        """Follow every product link found on a listing page."""
        meta = response.meta
        print("当前采集链接:{0}".format(meta['url']))
        for each in response.xpath('//div[@class="product_list wow fadeInUp"]//ul//li'):
            href = each.xpath("./a/@href").extract_first()
            if not href:
                # List entry without a link: nothing to follow.
                continue
            item = HongxingItem()
            item['catname'] = meta['name']
            # Resolve the link against the page URL; the original expression
            # had lost its host operand and applied unary "+" to a string.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.url_parse,
                meta={"item": item},
            )

    def url_parse(self, response):
        """Extract the product name and image URL from a detail page."""
        item = response.meta['item']
        name = response.xpath("//div[@class='product_t']//h3//text()").extract_first()
        src = response.xpath("//div[@id='product_show_01']//img/@src").extract_first()
        if name is None or src is None:
            self.logger.warning(
                "Skipping %s: product name or image not found", response.url
            )
            return
        item['name'] = name
        # Make the image URL absolute so it can be downloaded later.
        item['ico'] = response.urljoin(src)
        yield item