Scraping a novel with scrapy + pymongo
Using pymongo
- pymongo is the third-party library for working with MongoDB from Python; it is simple and convenient to use.
- pymongo installation: see the link on the official site.
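pymongo is usually installed with pip (pip install pymongo). After installation, a minimal connectivity check might look like the sketch below; it assumes a MongoDB server running on the default localhost:27017.
import pymongo

# Connect to a local MongoDB instance (default host/port are assumptions)
client = pymongo.MongoClient('localhost', 27017)
# server_info() raises an exception if the server is unreachable
print(client.server_info()['version'])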
Getting started with pymongo
# -*- coding: utf-8 -*-
import pymongo

# Get a connection to MongoDB (localhost:27017 by default)
connection = pymongo.MongoClient()
# Create (or reuse) a database
tdb = connection.Jikexueyuan
# Create a collection inside that database
post_info = tdb.test
# Documents are plain dicts
jike = {'name': '张三', 'age': 3, 'skill': 'python'}
god = {'name': '李四', 'age': 10, 'skill': 'pymongo'}
# Insert them into the collection
post_info.insert_one(jike)
post_info.insert_one(god)
# Delete a document
post_info.delete_one({'name': '张三'})
# Query: find() returns a cursor, iterate over it to see the documents
for doc in post_info.find():
    print(doc)
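A few more everyday operations, sketched against the same collection (the field names follow the dicts above):
# Find one document matching a filter
print(post_info.find_one({'skill': 'pymongo'}))
# Update a single document: $set changes one field in place
post_info.update_one({'name': '李四'}, {'$set': {'age': 11}})
# Count the documents matching a filter
print(post_info.count_documents({'age': {'$gt': 5}}))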
Using pymongo in scrapy
- In pipelines.py:
from scrapy.conf import settings
import pymongo


class PymongoPipeline(object):
    def __init__(self):
        # Read the MongoDB connection settings from settings.py
        host = settings['MONGO_HOST']
        port = settings['MONGO_PORT']
        db = settings['MONGO_DB']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[db]
        self.post = tdb[settings['MONGO_DOCNAME']]

    def process_item(self, item, spider):
        # Convert the Item to a dict and store it
        bookInfo = dict(item)
        self.post.insert_one(bookInfo)
        return item
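This pipeline expects the connection parameters to be defined in settings.py. The keys below match what __init__ reads above; the values are only examples for a local MongoDB instance.
# settings.py -- example values, adjust to your environment
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'Jikexueyuan'
MONGO_DOCNAME = 'Book'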
- Or, the reference version from the official Scrapy docs:
import pymongo


class MongoPipeline(object):
    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the connection parameters from the crawler's settings
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        # Open the MongoDB connection when the spider starts
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        # Close the connection when the spider finishes
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
- Then enable the pipeline in settings.py by adding it to ITEM_PIPELINES:
ITEM_PIPELINES = {
    # lower numbers run earlier; values are conventionally in the 0-1000 range
    'myproject.pipelines.MongoPipeline': 300,
}
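The official MongoPipeline also reads MONGO_URI and MONGO_DATABASE from the settings, so they need to be defined as well (placeholder values below):
# settings.py -- placeholder values for a local MongoDB instance
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'items'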
Scraping the novel
- Novel site: Daomubiji (盗墓笔记), http://www.daomubiji.com/
- items.py:
from scrapy import Field, Item


class XsItem(Item):
    bookName = Field()
    bookTitle = Field()
    chapterNum = Field()
    chapterName = Field()
    chapterURL = Field()
- In the spider. The exact matching rules depend on the page structure, so analyze the page yourself; this is only an example:
from scrapy.selector import Selector
# Import your own Item class (the path depends on your project layout)
from myproject.items import XsItem


def parse(self, response):
    selector = Selector(response)
    table = selector.xpath('//table')
    for each in table:
        bookName = each.xpath('tr/td[@colspan="3"]/center/h2/text()').extract()[0]
        # Link text holds "volume chapter-number chapter-name", the href holds the URL
        content = each.xpath('tr/td/a/text()').extract()
        url = each.xpath('tr/td/a/@href').extract()
        for i in range(len(url)):
            item = XsItem()
            item['bookName'] = bookName
            item['chapterURL'] = url[i]
            try:
                item['bookTitle'] = content[i].split(' ')[0]
                item['chapterNum'] = content[i].split(' ')[1]
            except Exception:
                continue
            try:
                item['chapterName'] = content[i].split(' ')[2]
            except Exception:
                # Fall back to the tail of the second field when there is no third one
                item['chapterName'] = content[i].split(' ')[1][-3:]
            yield item
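The parse method above belongs inside a Spider class. A minimal skeleton is sketched below; the spider name, allowed_domains, and start_urls are assumptions based on the site URL, so adjust them to your own project.
import scrapy


class XsSpider(scrapy.Spider):
    # Placeholder names -- change them to match your project
    name = 'daomubiji'
    allowed_domains = ['daomubiji.com']
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        # the parse logic shown above goes here
        pass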