scrapy爬取趣头条
# -*- coding: utf-8 -*-
"""Spider that crawls the Qutoutiao list API and every article detail page."""
import json
import re

import scrapy

from ..items import QutoutiaoItem
from ..settings import CATEGORY_INFO, LIST_LIMIT

# List API endpoint. It must be defined before the spider class because the
# class body uses it to build ``start_urls``.
BASE_API = 'http://api.1sapp.com/content/outList?'

# Matches the ``cid`` query parameter of a list-API URL.
_CID_PATTERN = re.compile(r'[?&]cid=(\d+)')

# Keys copied verbatim from each list-API entry into the item.
_LIST_FIELDS = (
    'source_name',    # source / publisher
    'title',          # headline
    'url',            # detail page URL
    'introduction',   # short summary
    'cover',          # cover image(s)
    'publish_time',   # publication time
)


def _build_start_urls(categories, limit):
    """Return the first-page list-API URL for every configured category."""
    return [
        BASE_API + "cid=%s&tn=1&page=1&limit=%s" % (category['cid'], limit)
        for category in categories
    ]


class QutoutiaoSpider(scrapy.Spider):
    """Crawl the article list of each category, then each article's page.

    ``parse`` handles the JSON list responses and schedules one request per
    article; ``detail_parse`` completes the item with the article body and
    the image URLs found inside it.
    """

    name = 'qutoutiao'
    # allowed_domains = ['qutoutiao.net']

    # Also exposed on the class so ``QutoutiaoSpider.BASE_API`` resolves.
    BASE_API = BASE_API
    # Category definitions (dicts with a ``cid`` key) and page size.
    categoryInfo = CATEGORY_INFO
    limit = LIST_LIMIT
    # Crawl entry points: one list-API URL per category.
    start_urls = _build_start_urls(categoryInfo, limit)

    def parse(self, response):
        """Parse one list-API response.

        Yields one ``scrapy.Request`` per article; the partially filled item
        travels to ``detail_parse`` in ``meta['meta_item']``.
        """
        # The category id is recovered from the request URL; 0 is used when
        # the URL carries no ``cid`` parameter.
        match = _CID_PATTERN.search(response.url)
        cid = match.group(1) if match else 0

        try:
            articles = json.loads(response.text)['data']['data'] or []
        except (ValueError, KeyError, TypeError):
            # Invalid JSON or an unexpected envelope: report it and skip the
            # page instead of raising out of the callback.
            self.logger.error(
                'Unexpected list API payload from %s', response.url)
            return

        for article in articles:
            item = QutoutiaoItem()
            for field in _LIST_FIELDS:
                item[field] = article[field]
            item['cid'] = cid
            # Fetch the detail page to fill in the article body.
            yield scrapy.Request(
                url=item['url'],
                meta={'meta_item': item},
                callback=self.detail_parse,
            )

    def detail_parse(self, response):
        """Add the article body and its image URLs to the item, then yield it."""
        item = response.meta['meta_item']
        content = response.xpath('//div[@class="content"]')

        # The leading "." keeps the search inside the content node; a bare
        # "//img" is evaluated against the whole document.
        image_sources = content.xpath(
            './/img/@src|.//img/@data-src').extract()
        # Resolve relative and protocol-relative sources so they can be
        # requested later.
        item['content_images'] = [
            response.urljoin(source) for source in image_sources
        ]

        body = content.extract_first()
        if body is None:
            # No content node: keep the item but make the gap visible.
            self.logger.warning('No content block found at %s', response.url)
            body = ''
        item['content'] = body
        yield item
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QutoutiaoItem(scrapy.Item):
    """Fields collected for one Qutoutiao article."""

    # Article id.
    aid = scrapy.Field()
    # Source / publisher name.
    source_name = scrapy.Field()
    # Headline.
    title = scrapy.Field()
    # Detail page URL.
    url = scrapy.Field()
    # Short summary.
    introduction = scrapy.Field()
    # Cover image(s).
    cover = scrapy.Field()
    # Publication time.
    publish_time = scrapy.Field()
    # Category id.
    cid = scrapy.Field()
    # Article body.
    content = scrapy.Field()
    # Images referenced inside the article body.
    content_images = scrapy.Field()
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

import logging

from fake_useragent import UserAgent
from scrapy import signals


class QutoutiaoSpiderMiddleware(object):
    """Spider middleware that passes everything through unchanged.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider.

        Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Forward each Request, dict or Item produced by the spider."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Do nothing when the spider or another middleware raises.

        Should return either None or an iterable of Response, dict or Item
        objects; this implementation returns None.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Forward the spider's start requests (requests only, not items)."""
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)


class UserAgent_CookiesMiddleware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    The ``fake_useragent`` attribute to read is taken from the
    ``RANDOM_UA_TYPE`` setting and defaults to ``"random"``.
    """

    # Headers written alongside the User-Agent.
    _EXTRA_HEADERS = (
        ('Accept', 'application/json, text/javascript, */*; q=0.01'),
        ('Origin', 'http://home.qutoutiao.net'),
        ('Referer', 'http://home.qutoutiao.net/pages/home.html'),
    )

    def __init__(self, crawler):
        super(UserAgent_CookiesMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware from the running crawler."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent and companion headers.

        Nothing is changed when no user agent string is obtained.
        """
        user_agent = getattr(self.ua, self.ua_type)
        if not user_agent:
            return

        request.headers['User-Agent'] = user_agent
        for header_name, header_value in self._EXTRA_HEADERS:
            request.headers[header_name] = header_value
        self.logger.debug('Current UserAgent: ' + user_agent)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import logging
import os
import shutil

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

from .qttutils import QttUtils

logger = logging.getLogger(__name__)


def _as_url_list(value):
    """Return *value* as a list of URLs.

    An empty or missing value gives an empty list. A lone value that is not a
    list or tuple is wrapped, so a single URL string is not iterated
    character by character.
    """
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _relocate_downloads(results, images_store, store_path):
    """Move successfully downloaded images from *images_store* to *store_path*.

    Args:
        results: the ``(success, info)`` pairs passed to ``item_completed``;
            ``info['path']`` is the file path relative to *images_store*.
        images_store: directory the images were downloaded into.
        store_path: destination directory.

    Returns:
        The new paths of the images now present in *store_path*.
    """
    relocated = []
    for ok, download in results:
        if not ok:
            continue
        relative_path = str(download['path'])
        source = os.path.join(images_store, relative_path)
        target = store_path + '/' + os.path.basename(relative_path)
        if os.path.exists(source):
            # shutil.move also works when both directories live on different
            # filesystems, where a plain os.rename fails.
            shutil.move(source, target)
        elif not os.path.exists(target):
            logger.warning(
                'Downloaded image %s is missing; skipping it', source)
            continue
        # A missing source with an existing target means the same image was
        # already moved (it is referenced more than once); reuse that file.
        relocated.append(target)
    return relocated


class CoverImagePipeline(ImagesPipeline):
    """Download an item's cover images and move them to the dated store."""

    # Download directory, read from the project settings at import time.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per cover image URL."""
        for image_url in _as_url_list(item.get('cover')):
            yield scrapy.Request(url=image_url)

    def item_completed(self, results, item, info):
        """Replace ``item['cover']`` with the relocated local file paths.

        NOTE(review): the field is always overwritten, so it becomes an empty
        list when no cover was downloaded. The original indentation was not
        recoverable; confirm this matches the intended behaviour.
        """
        item['cover'] = _relocate_downloads(
            results, self.IMAGES_STORE, QttUtils.getStorePath())
        return item


class ContentImagePipeline(ImagesPipeline):
    """Download the images of an article body and move them to the dated store.

    NOTE: an alternative considered by the original author stored
    ``(local_path, source_url)`` pairs in ``content_images`` instead of plain
    local paths.
    """

    # Download directory, read from the project settings at import time.
    IMAGE_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per content image URL."""
        for image_url in _as_url_list(item.get('content_images')):
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Replace ``item['content_images']`` with the relocated local paths.

        NOTE(review): the field is always overwritten, so it becomes an empty
        list when no image was downloaded. The original indentation was not
        recoverable; confirm this matches the intended behaviour.
        """
        item['content_images'] = _relocate_downloads(
            results, self.IMAGE_STORE, QttUtils.getStorePath())
        return item


class QutoutiaoPipeline(object):
    """Append every item as one JSON line to ``qutoutiao.json``."""

    def __init__(self):
        # The file lives in the dated storage directory.
        store_path = QttUtils.getStorePath()
        json_path = store_path + '/' + 'qutoutiao.json'
        # Despite its name this attribute holds the open file object; the
        # name is kept so existing references keep working.
        self.filename = open(json_path, 'wb')

    def process_item(self, item, spider):
        """Write *item* as a UTF-8 encoded JSON line and pass it on."""
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.filename.write(text.encode('utf-8'))
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.filename.close()
# -*- coding: utf-8 -*-
# @Time    : 2018-6-1 11:01
# @Author  : Amir
# @Site    :
# @File    : qttutils.py
# @Software: PyCharm

'''
Utility helpers for the Qutoutiao crawler.
'''

import os
import shutil
import time

from .settings import DATA_PATH


class QttUtils:
    """Namespace for helper functions shared by the pipelines."""

    @staticmethod
    def getStorePath(action='create'):
        """Return today's storage directory, ``DATA_PATH/YYYY/MM/DD``.

        The date comes from the local time.

        Args:
            action: ``'create'`` (default) creates the directory, including
                parents, when it does not exist; ``'remove'`` deletes the
                directory tree when it exists. Any other value leaves the
                filesystem untouched.

        Returns:
            The directory path as a string. With ``'remove'`` the returned
            path no longer exists.
        """
        dated_suffix = time.strftime("/%Y/%m/%d", time.localtime())
        store_path = DATA_PATH + dated_suffix

        already_there = os.path.exists(store_path)
        if action == 'remove':
            if already_there:
                shutil.rmtree(store_path)
        elif action == 'create':
            if not already_there:
                os.makedirs(store_path)
        return store_path
# -*- coding: utf-8 -*-

# Scrapy settings for QuTouTiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'QuTouTiao'

SPIDER_MODULES = ['QuTouTiao.spiders']
NEWSPIDER_MODULE = 'QuTouTiao.spiders'

# Number of records requested per list page.
LIST_LIMIT = 10

# Storage locations: crawl output and downloaded images.
DATA_PATH = r'./data'
IMAGES_STORE = r'./image'

# Categories to crawl: API category id and its display name.
CATEGORY_INFO = [
    {"cid": 255, "name": "推荐"},
    {"cid": 1, "name": "热点"},
    {"cid": 6, "name": "娱乐"},
    {"cid": 5, "name": "养生"},
    {"cid": 2, "name": "搞笑"},
    {"cid": 7, "name": "科技"},
    {"cid": 8, "name": "生活"},
    {"cid": 10, "name": "财经"},
    {"cid": 9, "name": "汽车"},
]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'QuTouTiao (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'QuTouTiao.middlewares.QutoutiaoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'QuTouTiao.middlewares.UserAgent_CookiesMiddleware': 299,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# NOTE(review): items pass through pipelines from the lowest to the highest
# value, so QutoutiaoPipeline (300) runs before the two image pipelines
# (301, 302) -- confirm that this order is intended.
ITEM_PIPELINES = {
    'QuTouTiao.pipelines.QutoutiaoPipeline': 300,
    'QuTouTiao.pipelines.ContentImagePipeline': 301,
    'QuTouTiao.pipelines.CoverImagePipeline': 302
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'