Scrapy crawler framework

This post walks through a small Scrapy project that scrapes quotes.toscrape.com: the item definition (items.py), the spider, and the item pipelines (pipelines.py).

items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NewItem(scrapy.Item):
    # Fields scraped for each quote:
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
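
A NewItem instance behaves like a dict restricted to the declared fields; a quick illustrative sketch (the values are placeholders, not scraped data):

# Illustrative only -- placeholder values, not part of the original post
item = NewItem()
item['text'] = 'An example quote'
item['author'] = 'Example Author'
item['tags'] = ['example', 'demo']
print(dict(item))          # {'text': 'An example quote', 'author': 'Example Author', 'tags': [...]}
# item['title'] = 'oops'   # raises KeyError: 'title' is not a declared field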

The spider (new.py in the project's spiders package):

# -*- coding: utf-8 -*-
import scrapy
from New.items import NewItem


class NewSpider(scrapy.Spider):
    name = 'new'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        print(response.text)
        quotes = response.css(".quote")
        for quote in quotes:
            item = NewItem()
            text = quote.css(".text::text").extract_first()
            author = quote.css(".author::text").extract_first()
            tags = quote.css(".tags .tag::text").extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        # Follow the "Next" pagination link; the last page has none, so stop there.
        next_page = response.css(".pager .next a::attr(href)").extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
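
With the project in place, the spider can be run from the project root with Scrapy's crawl command; the feed file name is just an example:

scrapy crawl new -o quotes.json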

 

pipelines.py:

# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

from scrapy.exceptions import DropItem


class NewPipeline(object):
    """Truncates overly long quote text and drops items that have none."""

    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + "..."
            return item
        else:
            # DropItem must be raised, not returned, for Scrapy to discard the item.
            raise DropItem('Missing text')


class MongoPipeline(object):
    """Stores every item in MongoDB, one collection per item class."""

    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection settings from the project's settings.py.
        return cls(
            mongo_url=crawler.settings.get("MONGO_URL"),
            mongo_db=crawler.settings.get("MONGO_DB")
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Use the item class name ("NewItem") as the collection name.
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))  # insert() is deprecated in PyMongo
        return item

    def close_spider(self, spider):
        self.client.close()
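
For either pipeline to run, it has to be registered in the project's settings.py, and MongoPipeline additionally reads MONGO_URL and MONGO_DB from there. A minimal sketch, assuming the project package is named New (as in the spider's import) and a local MongoDB instance; the connection string and database name are placeholders, not values from the original post:

# settings.py (excerpt) -- placeholder values
ITEM_PIPELINES = {
    'New.pipelines.NewPipeline': 300,    # runs first: trims long quote text
    'New.pipelines.MongoPipeline': 400,  # runs second: writes the item to MongoDB
}
MONGO_URL = 'mongodb://localhost:27017'  # placeholder connection string
MONGO_DB = 'quotes'                      # placeholder database name

Lower ITEM_PIPELINES numbers run earlier, so the text is truncated before it is stored.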
