Using the Scrapy Framework to Scrape Products from an Electronics Website
1. Creating the Project
Step 1: scrapy startproject lianhe
Step 2: cd lianhe
scrapy genspider product www.lhecn.com.cn
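After these two commands, the generated project should look roughly like the standard Scrapy layout below (start.py is added by hand in the next step, next to scrapy.cfg):

lianhe/
    scrapy.cfg
    start.py
    lianhe/
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            product.py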
2. Sample Code
start.py
from scrapy import cmdline
import os

if __name__ == '__main__':
    # Change into the directory containing this file so scrapy.cfg is found.
    dirname = os.path.dirname(os.path.abspath(__file__))
    os.chdir(dirname)
    cmdline.execute("scrapy crawl product".split())
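With this helper in the project root, the crawl can be started with python start.py (for example from an IDE's Run button), which is equivalent to running scrapy crawl product on the command line.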
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LianheItem(scrapy.Item):
    # define the fields for your item here like:
    catname = scrapy.Field()  # product category name
    name = scrapy.Field()     # product name
    pdf = scrapy.Field()      # URL of the product datasheet PDF
    pics = scrapy.Field()     # list of product image URLs
product.py
# -*- coding: utf-8 -*-
import scrapy
import requests
from lxml import etree

from ..items import LianheItem


class ProductSpider(scrapy.Spider):
    name = 'product'
    allowed_domains = ['www.lhecn.com.cn']
    host = 'https://www.lhecn.com.cn'
    # One entry per product category; {0} is the page-number placeholder.
    start_urls = [
        {"name": '线对板连接器', "url": host + '/category/wire-to-board/page/{0}/'},
        {"name": '线对线连接器', "url": host + '/category/wire-to-wire/page/{0}/'},
        {"name": '板对板连接器', "url": host + '/category/pc-board-in/page/{0}/'},
        {"name": '快插端子(110,187,250)', "url": host + '/category/faston-terminal/page/{0}/'},
        {"name": 'FFC&FPC连接器', "url": host + '/category/ffc-fpc-connector/page/{0}/'},
        {"name": '接线端子', "url": host + '/category/terminal-block/page/{0}/'},
    ]

    # Get the total page count of a category from the WP-PageNavi
    # pagination block on its first page.
    def get_all_page(self, url):
        response = requests.get(url)
        html = etree.HTML(response.content, parser=etree.HTMLParser())
        res = html.xpath('//div[@class="wp-pagenavi"]')
        if len(res) > 0:
            # The second-to-last pagination link holds the last page number.
            p = res[0].xpath("./a[last()-1]//text()")[0]
            return int(p)
        return 1

    def start_requests(self):
        for item in self.start_urls:
            # Find out how many pages this category has, then request each one.
            url = item.get('url')
            total_page = self.get_all_page(url.format('1'))
            print("total_page = " + str(total_page))
            for page in range(1, total_page + 1):
                link = url.format(str(page))
                yield scrapy.Request(link, callback=self.parse,
                                     meta={"url": link, "name": item.get('name')})

    def parse(self, response):
        meta = response.meta
        for each in response.xpath('//div[@class="pro-imgs col-md-4 col-sm-6 col-xs-6"]'):
            url = each.xpath("./div[@class='imgallbox imgtitle-h3']//h3//a/@href").extract()[0]
            name = each.xpath("./div[@class='imgallbox imgtitle-h3']//h3//a//text()").extract()[0]
            item = LianheItem()
            item['catname'] = meta['name']
            item['name'] = name.strip()
            # Follow the product detail page to collect the PDF and images.
            yield scrapy.Request(url, callback=self.url_parse, meta={"item": item})

    def url_parse(self, response):
        item = response.meta['item']
        # extract_first('') avoids an IndexError when a product has no PDF;
        # the pipeline skips empty URLs.
        item['pdf'] = response.xpath("//div[@class='pdf_uploads']//a/@href").extract_first('')
        item['pics'] = response.xpath("//div[@class='sp-wrap']//a/@href").extract()
        yield item
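Note that get_all_page issues a blocking requests.get call before the Scrapy engine takes over, so page counting happens outside Scrapy's scheduler. A non-blocking alternative (a sketch, not the original author's code) is to request page 1 of each category through Scrapy itself and schedule the remaining pages from the callback; the two methods below would replace get_all_page and start_requests inside ProductSpider:

# Sketch: fetch page 1 per category via Scrapy, then schedule the rest.
def start_requests(self):
    for item in self.start_urls:
        url = item.get('url')
        yield scrapy.Request(url.format('1'), callback=self.parse_first,
                             meta={"url": url, "name": item.get('name')})

def parse_first(self, response):
    # Read the last page number from the WP-PageNavi block; default to 1.
    pages = response.xpath('//div[@class="wp-pagenavi"]/a[last()-1]//text()').extract_first()
    total_page = int(pages) if pages else 1
    # Page 1 is already in hand, so parse it directly.
    yield from self.parse(response)
    url = response.meta["url"]
    for page in range(2, total_page + 1):
        yield scrapy.Request(url.format(str(page)), callback=self.parse,
                             meta={"url": url, "name": response.meta["name"]})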
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os

import requests


class LianhePipeline(object):
    count = 0      # total number of items scraped
    cat_dict = {}  # per-category item counter, used to number the saved files

    def process_item(self, item, spider):
        self.count += 1
        # Create the category folder if it does not exist yet.
        if not os.path.exists(item["catname"]):
            os.mkdir(item["catname"])
        cat_product_count = self.cat_dict.get(item["catname"])
        if cat_product_count is None:
            count = 1
        else:
            count = int(cat_product_count) + 1
        self.cat_dict[item["catname"]] = count
        # Save the remote PDF locally as <n>.pdf.
        file = item["catname"] + '/' + str(count) + ".pdf"
        self.save_file(item['pdf'], file)
        # Save each product image as <n>_<i>.jpg.
        num = 1
        for pic in item['pics']:
            file = item["catname"] + '/' + str(count) + "_" + str(num) + ".jpg"
            self.save_file(pic, file)
            num += 1
        # Record the product name in a matching .txt file.
        with open(item["catname"] + '/' + str(count) + '.txt', 'w') as f:
            f.write(item['name'])
        return item

    def save_file(self, url, filename):
        if url == '':
            return False
        response = requests.get(url)
        with open(filename, 'wb') as f:
            f.write(response.content)

    def close_spider(self, spider):
        print("Total items scraped: {0}".format(str(self.count)))
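As the generated comment above says, the pipeline only runs if it is registered in settings.py. A minimal excerpt of the settings this project needs; the priority 300, ROBOTSTXT_OBEY, and DOWNLOAD_DELAY values are assumptions, since the original post does not show settings.py:

# settings.py (excerpt)
ITEM_PIPELINES = {
    'lianhe.pipelines.LianhePipeline': 300,  # 300 is an arbitrary priority
}

# Assumed settings, not from the original post:
ROBOTSTXT_OBEY = False  # only if the site's robots.txt would otherwise block the crawl
DOWNLOAD_DELAY = 1      # throttle requests to be polite to the site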