Scraping Qutoutiao with Scrapy
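
This post walks through a small Scrapy project that pulls article lists from Qutoutiao's list API, follows each article to its detail page, and saves the articles as JSON together with their cover and inline images. First, the spider (assumed to live at QuTouTiao/spiders/qutoutiao.py in the standard Scrapy layout):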

# -*- coding: utf-8 -*-
import scrapy
from ..items import QutoutiaoItem
import json
import re
from ..settings import CATEGORY_INFO, LIST_LIMIT



class QutoutiaoSpider(scrapy.Spider):

    name = 'qutoutiao'
    # allowed_domains = ['qutoutiao.net']

    # List API endpoint
    BASE_API = 'http://api.1sapp.com/content/outList?'

    # Build one start URL per category
    start_urls = []
    categoryInfo = CATEGORY_INFO
    limit = LIST_LIMIT
    for value in categoryInfo:
        url = BASE_API + 'cid=%s&tn=1&page=1&limit=%s' % (
            str(value['cid']), str(limit))
        start_urls.append(url)
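    # With the default settings this builds URLs such as (illustrative):
    # http://api.1sapp.com/content/outList?cid=255&tn=1&page=1&limit=10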


    def parse(self, response):

        response_url = response.url
        # Pull the category id back out of the request URL
        searchObj = re.search(r'(.*)cid=(\d+)', response_url)
        cid = searchObj.group(2) if searchObj else 0

        data = json.loads(response.text)['data']['data']
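        # The payload is assumed to look roughly like (inferred from the
        # fields read below; not an official schema):
        # {"data": {"data": [{"source_name": ..., "title": ..., "url": ...,
        #                     "introduction": ..., "cover": [...],
        #                     "publish_time": ...}, ...]}}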

        for value in data:
            # Initialize an item for this article
            item = QutoutiaoItem()
            # Source / publisher name
            item['source_name'] = value['source_name']
            # Title
            item['title'] = value['title']
            # Detail page URL
            url = item['url'] = value['url']
            # url = url[0:url.find('?')]
            # Summary
            item['introduction'] = value['introduction']
            # Cover image(s)
            item['cover'] = value['cover']
            # Publish time
            item['publish_time'] = value['publish_time']
            # Category id
            item['cid'] = cid

            # Crawl the detail page, passing the partially filled item via meta
            yield scrapy.Request(url=item['url'], meta={'meta_item': item},
                                 callback=self.detail_parse)

    # Detail page
    def detail_parse(self, response):
        # Recover the item passed along via meta
        meta_item = response.meta['meta_item']
        # Extract the article body; the leading dots keep the image
        # XPath relative to the content div instead of the whole page
        content_selector = response.xpath('//div[@class="content"]')
        meta_item['content_images'] = content_selector.xpath(
            './/img/@src|.//img/@data-src').extract()
        meta_item['content'] = content_selector.extract_first()
        yield meta_item
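
With the settings below in place, the spider is run from the project root with the standard Scrapy CLI: scrapy crawl qutoutiao (qutoutiao being the name defined above).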





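Next, the item definition (items.py):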
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QutoutiaoItem(scrapy.Item):
    # define the fields for your item here like:

    # Article id
    aid = scrapy.Field()
    # Source / publisher name
    source_name = scrapy.Field()
    # Title
    title = scrapy.Field()
    # Detail page URL
    url = scrapy.Field()
    # Summary
    introduction = scrapy.Field()
    # Cover image(s)
    cover = scrapy.Field()
    # Publish time
    publish_time = scrapy.Field()
    # Category id
    cid = scrapy.Field()
    # Article body (HTML)
    content = scrapy.Field()
    # Images found inside the body
    content_images = scrapy.Field()
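
The middlewares (middlewares.py). The first class is the stock Scrapy template; the second rotates the User-Agent and sets the headers the site expects: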
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class QutoutiaoSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


from fake_useragent import UserAgent
import logging


class UserAgent_CookiesMiddleware(object):
    # Rotate the User-Agent randomly on each request
    def __init__(self, crawler):
        super(UserAgent_CookiesMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        random_agent = get_ua()

        if random_agent:
            # Apply the rotated UA plus the headers the API expects
            request.headers['User-Agent'] = random_agent
            request.headers['Accept'] = 'application/json, text/javascript, */*; q=0.01'
            request.headers['Origin'] = 'http://home.qutoutiao.net'
            request.headers['Referer'] = 'http://home.qutoutiao.net/pages/home.html'

            self.logger.debug('Current UserAgent: ' + random_agent)
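
fake_useragent exposes browser-specific attributes as well as a random one (e.g. ua.random, ua.chrome, ua.firefox), so RANDOM_UA_TYPE can pin a browser family; this project falls back to "random". Next, the pipelines (pipelines.py):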
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import json
import os

import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline

from .qttutils import QttUtils


# Cover image download pipeline
class CoverImagePipeline(ImagesPipeline):
    # Image store root from settings
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    # Schedule one download request per cover image
    def get_media_requests(self, item, info):
        cover_images = item['cover']
        if cover_images:
            for image_url in cover_images:
                yield scrapy.Request(url=image_url)

    # Called once all image downloads for the item have finished
    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        # Dated target directory
        store_path = QttUtils.getStorePath()
        coverImages = []
        # Move each downloaded file into the dated directory
        if image_path:
            for image_url in image_path:
                file_name = os.path.split(str(image_url))
                new_image = store_path + '/' + file_name[1]
                coverImages.append(new_image)
                os.rename(self.IMAGES_STORE + '/' + image_url, new_image)
        item['cover'] = coverImages
        return item
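# Note: `results` is a list of (success, info) tuples; on success `info`
# carries the keys 'url', 'path' and 'checksum' (standard ImagesPipeline
# behaviour), which is why x['path'] is read above.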

# Inline content image download pipeline
class ContentImagePipeline(ImagesPipeline):
    # Image store root from settings
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    # Schedule one download request per inline image
    def get_media_requests(self, item, info):
        content_images = item['content_images']
        if content_images:
            for image_url in content_images:
                yield scrapy.Request(image_url)

    # Called once all image downloads for the item have finished
    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        # Dated target directory
        store_path = QttUtils.getStorePath()
        contentImages = []
        # Move each downloaded file into the dated directory
        if image_path:
            for base_path in image_path:
                file_name = os.path.split(str(base_path))
                new_image = store_path + "/" + file_name[1]
                contentImages.append(new_image)
                os.rename(self.IMAGES_STORE + "/" + base_path, new_image)
        item['content_images'] = contentImages
        return item

    # # Alternative item_completed that also keeps each image's source URL
    # def item_completed(self, results, item, info):
    #     image_info = [(x['path'], x['url']) for ok, x in results if ok]
    #     # Dated target directory
    #     store_path = QttUtils.getStorePath()
    #     contentImages = []
    #     # Move each downloaded file into the dated directory
    #     if image_info:
    #         for value in image_info:
    #             image_url = value[0]
    #             image_source = value[1]
    #
    #             file_name = os.path.split(str(image_url))
    #             new_image = store_path + "/" + file_name[1]
    #             contentImages.append((new_image, image_source))
    #             os.rename(self.IMAGES_STORE + "/" + image_url, new_image)
    #     item['content_images'] = contentImages
    #     return item



class QutoutiaoPipeline(object):

    def __init__(self):
        # Open a JSON output file inside the dated storage directory
        store_path = QttUtils.getStorePath()
        json_path = store_path + '/' + 'qutoutiao.json'
        self.filename = open(json_path, 'wb')

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.filename.write(text.encode('utf-8'))
        return item

    def close_spider(self, spider):
        self.filename.close()
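
Each item is written as one JSON object per line (JSON Lines) into the dated directory created by QttUtils.getStorePath(). The helper itself (qttutils.py):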
# -*- coding: utf-8 -*-
# @Time    : 2018-6-1 11:01
# @Author  : Amir
# @Site    : 
# @File    : qttutils.py
# @Software: PyCharm

'''
Utility helpers for the Qutoutiao crawler.
'''

import time
import os
import shutil
from .settings import DATA_PATH

class QttUtils:
    # Get the dated storage path.
    #
    # @param  [string] action ['remove' deletes the directory; default 'create']
    # @return [string] path/year/month/day/*
    @staticmethod
    def getStorePath(action='create'):
        localtimes = time.localtime()
        year = time.strftime('%Y', localtimes)
        month = time.strftime('%m', localtimes)
        day = time.strftime('%d', localtimes)
        store_path = DATA_PATH + '/%s/%s/%s' % (year, month, day)

        # Remove the directory if requested
        if os.path.exists(store_path) and action == 'remove':
            shutil.rmtree(store_path)

        # Create the nested directories when missing
        if not os.path.exists(store_path) and action == 'create':
            os.makedirs(store_path)

        return store_path
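
For example (illustrative), with DATA_PATH = './data' a call on 2018-06-02 returns './data/2018/06/02' and creates it if missing; getStorePath('remove') instead deletes that day's directory. Finally, the project settings (settings.py):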
# -*- coding: utf-8 -*-

# Scrapy settings for QuTouTiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'QuTouTiao'

SPIDER_MODULES = ['QuTouTiao.spiders']
NEWSPIDER_MODULE = 'QuTouTiao.spiders'


# Number of records requested per list-API page
LIST_LIMIT = 10

# Storage paths
DATA_PATH = r'./data'
IMAGES_STORE = r'./image'

# Categories to crawl (each cid becomes one start URL)
CATEGORY_INFO = [
    {"cid":255,"name":"推荐"},
    {"cid":1,"name":"热点"},
    {"cid":6,"name":"娱乐"},
    {"cid":5,"name":"养生"},
    {"cid":2,"name":"搞笑"},
    {"cid":7,"name":"科技"},
    {"cid":8,"name":"生活"},
    {"cid":10,"name":"财经"},
    {"cid":9,"name":"汽车"},
]
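# English glosses for the names above: recommended, trending, entertainment,
# wellness, funny, tech, lifestyle, finance, cars.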

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'QuTouTiao (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'QuTouTiao.middlewares.QutoutiaoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   'QuTouTiao.middlewares.UserAgent_CookiesMiddleware': 299,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'QuTouTiao.pipelines.QutoutiaoPipeline': 300,
    'QuTouTiao.pipelines.ContentImagePipeline': 301,
    'QuTouTiao.pipelines.CoverImagePipeline': 302
}
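# Note: lower numbers run first, so QutoutiaoPipeline (300) serializes the item
# before the image pipelines (301/302) rewrite 'cover' and 'content_images' to
# local paths; swap the priorities if the JSON should record the local paths.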

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

 

posted on 2018-06-02 10:28 by 小白Amir