爬取知名社区技术文章_items_2
在 item 中定义需要获取的字段,并对原始数据进行处理,使数据合法化。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Scrapy item definitions and field processors for the article crawler.

The helper functions below are wired into ``JobboleItem`` as input/output
processors; they normalise the raw scraped strings before the item is
handed on for storage.
"""
import hashlib
import re

import scrapy
from scrapy.loader.processors import (MapCompose, TakeFirst, Join)
from scrapy.loader import ItemLoader

# First run of ASCII/Unicode digits anywhere in a string. Used with
# ``search`` so that leading newlines do not prevent a match.
_FIRST_INT_RE = re.compile(r'\d+')


def go_md5(value):
    """Return the hex MD5 digest of *value*.

    The digest of the article URL is used as the primary key of the table.

    ``str`` input is encoded as UTF-8 and ``bytes``/``bytearray`` input is
    hashed as-is. Any other type contributes no data, so the digest of the
    empty input is returned (this fallback is kept from the original
    implementation).
    """
    digest = hashlib.md5()
    if isinstance(value, str):
        digest.update(value.encode('utf-8'))
    elif isinstance(value, (bytes, bytearray)):
        # Previously bytes fell through and every such value produced the
        # same "empty" digest, i.e. colliding primary keys.
        digest.update(value)
    return digest.hexdigest()


def go_time(value):
    """Normalise a scraped publish-time string.

    Surrounding whitespace and every ``·`` separator are removed, ``/`` is
    replaced by ``-`` and the result is stripped again, e.g.
    ``' 2018/03/27 · '`` becomes ``'2018-03-27'``. According to the original
    author's note the result is meant to fit a MySQL date column. An input
    that is empty after cleaning yields ``''``.
    """
    cleaned = value.strip().replace('·', '')
    return cleaned.replace('/', '-').strip()


def go_cont(value):
    """Strip leading and trailing whitespace from a fragment of article text.

    Only the ends are trimmed; whitespace inside the fragment is kept.
    """
    return value.strip()


def go_img(value):
    """Return *value* unchanged.

    Used as the ``img_url`` output processor so that the field stays a list
    instead of being reduced by the loader's default ``TakeFirst``; per the
    original author's note the image downloader expects a list of URLs.
    """
    return value


def get_num(value):
    """Extract the first integer found in *value* (comment/like/collection counts).

    Returns ``0`` when *value* contains no digits. The search spans line
    breaks, so text such as ``'\\n 3 comments'`` yields ``3``.
    """
    found = _FIRST_INT_RE.search(value)
    if found:
        return int(found.group())
    return 0


class ArticleItemLoader(ItemLoader):
    """ItemLoader that, by default, keeps only the first value of each field."""
    default_output_processor = TakeFirst()


class JobboleItem(scrapy.Item):
    """Fields collected for one article.

    ``input_processor`` pre-processes each extracted value;
    ``output_processor`` shapes the collected values when the item is built.
    """
    # MD5-based identifier (see go_md5).
    cont_id = scrapy.Field(
        input_processor=MapCompose(go_md5)
    )
    cont_url = scrapy.Field()
    title = scrapy.Field()
    publish_time = scrapy.Field(
        input_processor=MapCompose(go_time)
    )
    # Stripped text fragments concatenated without a separator.
    cont = scrapy.Field(
        input_processor=MapCompose(go_cont),
        output_processor=Join('')
    )
    # Overrides the loader default so the value remains a list.
    img_url = scrapy.Field(
        output_processor=MapCompose(go_img)
    )
    link_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    collection_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    comment_num = scrapy.Field(
        input_processor=MapCompose(get_num)
    )
    img_path = scrapy.Field()


# Quick manual check
if __name__ == '__main__':
    result = get_num(' s ss 14 ssss')
    print(result)