pyspider 数据存储到mongoDB中
1、新建文件config.json,内容如下,文件放在pyspider文件夹下(即第4步中运行pyspider命令的目录,本文为D:\Python\Python36\Lib\site-packages\pyspider)
{
"taskdb": "mongodb+taskdb://127.0.0.1:27017/pyspider_taskdb",
"projectdb": "mongodb+projectdb://127.0.0.1:27017/pyspider_projectdb",
"resultdb": "mongodb+resultdb://127.0.0.1:27017/pyspider_resultdb",
"message_queue": "redis://127.0.0.1:6379/0",
"webui": {
"port": 5000
}
}
2、安装redis,在redis文件夹下启动终端,运行命令启动redis服务端
E:\redis>redis-server.exe redis.windows.conf
redis默认有16个数据库(db0~db15),上述config.json中message_queue地址末尾的/0表示选择index为0的数据库
若想启动客户端,运行命令如下,set,get为测试
E:\redis>redis-cli.exe -h 127.0.0.1 -p 6379
127.0.0.1:6379> set myKey abc
OK
127.0.0.1:6379> get myKey
"abc"
127.0.0.1:6379>
3、安装mongoDB,建文件夹db作为数据目录,并通过--dbpath配置到mongoDB里去(若使用安装程序注册的MongoDB服务,可不手动建;若用下面的命令手动启动,--dbpath指向的文件夹需已存在,否则mongod会报错退出)
在bin文件夹下运行命令
E:\MongoDB\Server\4.0\bin>mongod.exe --dbpath \data\db
在客户端运行一些查询命令
show dbs
查看有哪些数据库
db
查看当前数据库
use dbname
使用dbname数据库作为当前数据库
show tables / show collections
查看当前数据库下的表或集合,都指一个意思
db.website.find()
查看当前数据库下website集合的数据内容
db.website.find().count()
查看website表里数据总数
4、启动redis,启动mongoDB后,启动pyspider,并把新加的配置文件配置进去
D:\Python\Python36\Lib\site-packages\pyspider>pyspider --config config.json
5、发现需要安装第三方模块
pip install redis
pip install pymongo
6、在项目里重写(override)函数on_result
import pymongo
def on_result(self, result):
    """Store one crawl result in the MongoDB ``website`` collection.

    This hook is invoked for every crawled link, but only ``detail_page``
    returns a value, so empty results are skipped without touching the
    database.

    Args:
        result: The dict returned by a crawl callback, or a falsy value
            when the callback returned nothing.

    Raises:
        KeyError: If ``result`` lacks one of the expected product fields.
    """
    if not result:
        return

    # Only these fields are persisted. Copying them into a fresh dict also
    # keeps the caller's ``result`` from being mutated with the generated
    # ``_id``.
    fields = (
        'originalLink',
        'productName',
        'price',
        'productDescription',
        'category1',
        'category2',
        'category3',
        'images',
    )
    data = {field: result[field] for field in fields}

    # The context manager closes the connection once the insert finishes, so
    # a long crawl does not accumulate one open client per result.
    with pymongo.MongoClient(host='127.0.0.1', port=27017) as client:
        # Database and collection are created on first insert; the database
        # may also be one of those named in config.json.
        coll = client['pyspider_projectdb']['website']
        # ``Collection.insert`` was deprecated in PyMongo 3.0 and removed in
        # 4.0; ``insert_one`` is the supported replacement.
        data_id = coll.insert_one(data).inserted_id
    print(data_id)
7、完整代码
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-11-08 09:56:40
# Project: product

from pyspider.libs.base_handler import *
import re
import base64
import os
import urllib
import urllib.request
import requests
import json
import pymongo
import uuid

# Directory where product images are downloaded before being base64-encoded.
IMAGE_DIR = 'D:\\pythonlianxi\\testimg'
# Text file that receives one line (the repr of the result dict) per product.
RESULT_LOG = 'D:\\pythonlianxi\\zhe800.txt'
# Placeholder values used when a breadcrumb level is missing from the page.
DEFAULT_CATEGORIES = ('category1', 'category2', 'category3')
# Fields copied from a crawl result into the MongoDB document.
RESULT_FIELDS = (
    'id',
    'originalLink',
    'productName',
    'price',
    'productDescription',
    'category1',
    'category2',
    'category3',
    'images',
)


class Handler(BaseHandler):
    """pyspider handler that scrapes zhe800.com products into MongoDB."""

    def default(self, obj):
        """Decode ``bytes`` to ``str``; defer anything else to ``JSONEncoder``.

        NOTE(review): nothing in this script calls this method; it looks like
        a ``json.JSONEncoder.default`` hook kept from an earlier revision.
        """
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        return json.JSONEncoder.default(self, obj)

    crawl_config = {
        "headers": {
            "User-Agent": "BaiDuSpider",
        }
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl from the site's home page once a day."""
        self.crawl('https://www.zhe800.com/', callback=self.index_page,
                   validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow product links to ``detail_page`` and brand links recursively."""
        for each in response.doc('a[href^="http"]').items():
            href = each.attr.href
            if re.match('https://shop.zhe800.com/products/.+', href):
                self.crawl(href, callback=self.detail_page,
                           validate_cert=False, connect_timeout=50,
                           timeout=200)
            elif re.match('https://brand.zhe800.com/.+', href):
                self.crawl(href, callback=self.index_page,
                           validate_cert=False, connect_timeout=50,
                           timeout=200)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one product from its detail page.

        Downloads the product's ``.jpg`` images into ``IMAGE_DIR``, embeds
        them as base64, appends the repr of the result to ``RESULT_LOG`` and
        returns the result dict, which is then handed to ``on_result``.

        Returns:
            A dict with the keys listed in ``RESULT_FIELDS``, or ``None``
            when the page has no ``h1`` product title.
        """
        product_name = response.doc('h1').text()
        if not product_name:
            return

        os.makedirs(IMAGE_DIR, exist_ok=True)

        # Image objects: {'id': 1-based index, 'base64': encoded file bytes}.
        images = []
        for img in response.doc('div[class="deteilpic l"]>UL>LI>A>IMG').items():
            src = img.attr.src
            # An <img> without a src attribute yields None, which re.match
            # would reject with TypeError.
            if not src or not re.match(r'.+?\.jpg', src):
                continue
            image_id = len(images) + 1
            image_path = os.path.join(IMAGE_DIR, '{0}.jpg'.format(image_id))
            urllib.request.urlretrieve(src, image_path)
            with open(image_path, 'rb') as f:
                encoded = base64.b64encode(f.read()).decode()
            images.append({'id': image_id, 'base64': encoded})

        # Start from the placeholders so the categories are always defined,
        # even when the page has no breadcrumb element at all.
        categories = list(DEFAULT_CATEGORIES)
        for each in response.doc('aside[class="pos area"]').items():
            # Breadcrumb levels are separated by ' > '; index 0 is skipped and
            # levels 1-3 are the categories.
            crumbs = each.text().split(' > ')
            categories = [
                crumbs[level] if level < len(crumbs) else placeholder
                for level, placeholder in enumerate(DEFAULT_CATEGORIES, start=1)
            ]

        price_text = response.doc('strong[class="red js_price_st"]>I').text()
        try:
            price = float(price_text)
        except (TypeError, ValueError):
            # The text is not a bare number: take the first numeric token,
            # converted so that price is numeric on every path. Fall back to
            # 0 when no number is present.
            numbers = re.findall(r'[0-9]*\.?[0-9]+', price_text or '')
            price = float(numbers[0]) if numbers else 0

        description = ''.join(
            des.attr.title
            for des in response.doc('ul[class="list12 clear"]>LI').items()
            if des.attr.title
        )

        result = {
            'id': ''.join(str(uuid.uuid4()).split('-')),
            'originalLink': response.url,
            'productName': product_name,
            'price': price,
            'productDescription': description,
            'category1': categories[0],
            'category2': categories[1],
            'category3': categories[2],
            'images': images,
        }

        with open(RESULT_LOG, 'a') as f:
            f.write(str(result) + '\n')

        return result

    def on_result(self, result):
        """Store one crawl result in the MongoDB ``productzhe`` collection.

        Callbacks that return nothing produce a falsy ``result`` and are
        skipped without touching the database.

        Raises:
            KeyError: If ``result`` lacks one of ``RESULT_FIELDS``.
        """
        if not result:
            return
        print(result)

        # A fresh dict keeps the caller's ``result`` from being mutated with
        # the generated ``_id``.
        data = {field: result[field] for field in RESULT_FIELDS}

        # The context manager closes the connection after the insert instead
        # of leaving one open client behind per result.
        with pymongo.MongoClient(host='127.0.0.1', port=27017) as client:
            coll = client['pyspider_projectdb']['productzhe']
            # ``Collection.insert`` was deprecated in PyMongo 3.0 and removed
            # in 4.0; ``insert_one`` is the supported replacement.
            coll.insert_one(data)