Scrapy crawler framework: a minimal quotes.toscrape.com project, consisting of the item definition (items.py), the spider, and two item pipelines (text truncation and MongoDB storage).
# -*- coding: utf-8 -*-
# items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class NewItem(scrapy.Item):
    # define the fields for your item here, e.g.:
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
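For reference, a scrapy.Item subclass behaves much like a dict: fields declared with scrapy.Field() are read and written by key, and the whole item converts cleanly with dict() (which the MongoDB pipeline below relies on). A minimal standalone sketch, with illustrative values:

# sketch: dict-style access on the NewItem class defined above
item = NewItem()
item['text'] = 'Some quote'          # placeholder values, not scraped data
item['author'] = 'Someone'
item['tags'] = ['life']
print(item['author'])                # -> Someone
print(dict(item))                    # -> {'text': ..., 'author': ..., 'tags': [...]}

Note that assigning to a key that was not declared as a Field raises KeyError, which catches typos early.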
# -*- coding: utf-8 -*-
# new.py (the spider)
import scrapy

from New.items import NewItem


class NewSpider(scrapy.Spider):
    name = 'new'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # print(response.text)  # uncomment to dump the raw page for debugging
        quotes = response.css(".quote")
        for quote in quotes:
            item = NewItem()
            item['text'] = quote.css(".text::text").extract_first()
            item['author'] = quote.css(".author::text").extract_first()
            item['tags'] = quote.css(".tags .tag::text").extract()
            yield item
        # follow the "Next" pagination link; "next" is renamed to avoid
        # shadowing the built-in, and guarded so the last page doesn't
        # trigger a request with a None URL
        next_page = response.css(".pager .next a::attr(href)").extract_first()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
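To try the spider, run it from the project root with Scrapy's standard crawl command; the -o option exports the yielded items to a feed file:

scrapy crawl new -o quotes.json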
# -*- coding: utf-8 -*-
# pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.exceptions import DropItem


class NewPipeline(object):
    def __init__(self):
        # truncate quote text longer than this many characters
        self.limit = 50

    def process_item(self, item, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + "..."
            return item
        else:
            # DropItem must be raised, not returned, for Scrapy to discard the item
            raise DropItem('Missing text')
class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # pull the connection parameters from the project settings
        return cls(
            mongo_url=crawler.settings.get("MONGO_URL"),
            mongo_db=crawler.settings.get("MONGO_DB")
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # store each item in a collection named after its class ("NewItem");
        # insert_one replaces the insert() method removed in pymongo 4
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
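Neither pipeline runs until it is registered in ITEM_PIPELINES, and MongoPipeline additionally expects the MONGO_URL and MONGO_DB settings that its from_crawler reads. A minimal settings.py sketch, assuming the project package is named New (as in the spider's import) and using placeholder Mongo values:

# settings.py (excerpt)
# lower numbers run first, so the text is truncated before it reaches MongoDB
ITEM_PIPELINES = {
    'New.pipelines.NewPipeline': 300,
    'New.pipelines.MongoPipeline': 400,
}

MONGO_URL = 'mongodb://localhost:27017'  # placeholder; point at your MongoDB instance
MONGO_DB = 'quotes'                      # placeholder database name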