Scrapy学习-8-ItemLoader
ItemLoader使用
作用
方便管理、维护和重用 XPath 或 CSS 提取规则
实例
ItemLoader + 图片处理
# items.py
import datetime
import re

import scrapy
from scrapy.loader import ItemLoader
# NOTE(review): recent Scrapy releases expose these processors from the
# separate `itemloaders.processors` module - confirm against the installed
# Scrapy version before upgrading.
from scrapy.loader.processors import MapCompose, Join, TakeFirst

# Captures the first run of digits on the first line of the input.
# Raw string so that ``\d`` is not treated as an (invalid) string escape.
_FIRST_NUMBER_RE = re.compile(r".*?(\d+).*")


def date_convert(value):
    """Parse a ``YYYY/MM/DD`` string into a ``datetime.date``.

    Returns today's date when ``value`` is not a string in that format.
    """
    try:
        return datetime.datetime.strptime(value, "%Y/%m/%d").date()
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: not a string.
        return datetime.datetime.now().date()


def get_nums(value):
    """Return the first integer found in ``value``, or 0 when there is none."""
    match_re = _FIRST_NUMBER_RE.match(value)
    if match_re:
        return int(match_re.group(1))
    return 0


def return_value(value):
    """Identity processor: hand the value back unchanged."""
    return value


def remove_comment_tags(value):
    """Blank out the comment-count entry that is extracted along with the tags.

    Any value containing "评论" is replaced by an empty string; every other
    value is returned unchanged.
    """
    if "评论" in value:
        return ""
    return value


class ArticleItemLoader(ItemLoader):
    """Custom item loader whose fields output their first value by default."""

    default_output_processor = TakeFirst()


class ArticlespiderItem(scrapy.Item):
    """Article item; per-field processors are declared on the fields."""

    # The two functions are chained, so each title value ends up with
    # "-jobbole-abc" appended.
    title = scrapy.Field(
        input_processor=MapCompose(lambda x: x + "-jobbole", lambda y: y + "-abc")
    )
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    # front_image_url has to remain a list, so the loader's default output
    # processor (TakeFirst) cannot be used; override output_processor with an
    # identity mapping that keeps the original list.
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    # Comment-count entries are blanked first, then the tags are joined
    # with commas.
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()


# pipelines.py
from scrapy.pipelines.images import ImagesPipeline


class ArticleImagePipeline(ImagesPipeline):
    """Image pipeline that records the stored image path on the item."""

    def item_completed(self, results, item, info):
        """Copy the downloaded image's path into ``item['front_image_path']``.

        ``results`` is iterated as ``(ok, value)`` pairs. Only pairs whose
        ``ok`` flag is truthy are read, because ``value`` carries a ``path``
        entry only for a successful download. When several images succeed the
        last one wins; when none succeeds the field is left unset instead of
        raising.

        Returns the item so it is handed on to the next pipeline.
        """
        if "front_image_url" in item:
            image_paths = [value['path'] for ok, value in results if ok]
            if image_paths:
                item['front_image_path'] = image_paths[-1]
        # The item must be returned: priorities are configured in settings,
        # and returning it lets this pipeline pass the item on to the next
        # pipeline.
        return item


# settings: register the project's own pipelines. The numbers are the
# priorities; ArticleImagePipeline (1) is meant to run before
# ArticlespiderPipeline (300).
ITEM_PIPELINES = {
    'ArticleSpider.pipelines.ArticlespiderPipeline': 300,
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
}