Synchronous storage

The pipeline below writes each item to MySQL directly with pymysql; every insert blocks until the database commits.
import pymysql


class ScrapyMoviePipeline:
    def __init__(self):
        # Open the MySQL connection once when the pipeline is created
        self.conn = pymysql.connect(
            host='127.0.0.1',
            user='root',
            passwd='admin@123',
            db='scrapy_movie'
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        sql = 'INSERT INTO tbl_scrapy (name, url) VALUES (%s, %s)'
        self.cur.execute(sql, (item["name"], item["url"]))
        self.conn.commit()
        return item  # pass the item on to any later pipelines

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
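For reference, a minimal item definition matching the two fields the pipeline writes could look like the sketch below; the class name ScrapyMovieItem is hypothetical, only the name and url fields come from the code above.

# items.py -- a minimal sketch; ScrapyMovieItem is an assumed class name,
# only the name and url fields are implied by the pipeline above
import scrapy

class ScrapyMovieItem(scrapy.Item):
    name = scrapy.Field()  # movie title
    url = scrapy.Field()   # detail-page link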
Asynchronous storage

To avoid blocking the crawl on every insert, the variant below hands the writes to Twisted's adbapi connection pool, which runs them in a thread pool and returns immediately.
import pymysql
from twisted.enterprise import adbapi


class ScrapyMoviePipeline:
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """
        Build the database connection pool. The method name is fixed:
        Scrapy calls it automatically and passes in the project settings.
        :param settings: Scrapy settings object
        :return: pipeline instance
        """
        params = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            cursorclass=pymysql.cursors.DictCursor  # return rows as dicts
        )
        # Create a Twisted ConnectionPool, backed by pymysql (MySQLdb also works)
        dbpool = adbapi.ConnectionPool('pymysql', **params)
        # Return the pipeline instance holding the pool
        return cls(dbpool)

    def process_item(self, item, spider):
        """
        Run the MySQL insert asynchronously with Twisted: runInteraction executes
        do_database in a pool thread and returns a Deferred.
        """
        query = self.dbpool.runInteraction(self.do_database, item)  # function to run and its data
        # Attach an errback so database errors are reported instead of lost
        query.addErrback(self.handle_error)
        return item

    def do_database(self, cursor, item):
        # SQL statements; both are parameterised to avoid SQL injection
        insert_sql = 'INSERT INTO tbl_scrapy (name, url) VALUES (%s, %s)'
        select_sql = 'SELECT * FROM tbl_scrapy WHERE name=%s AND url=%s'
        rep = cursor.execute(select_sql, (item["name"], item["url"]))
        # Insert only if the record is not already in the table
        if rep == 0:
            cursor.execute(insert_sql, (item["name"], item["url"]))

    def handle_error(self, failure):
        # Log the error information
        print(failure)
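The asynchronous pipeline reads its connection parameters from the Scrapy settings, so settings.py needs entries along the lines of the sketch below; the values are placeholders copied from the synchronous example, and the pipeline path depends on the actual project layout.

# settings.py -- a sketch; host/user/password values are placeholders
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'scrapy_movie'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'admin@123'

# Enable the pipeline (the module path depends on your project name)
ITEM_PIPELINES = {
    'scrapy_movie.pipelines.ScrapyMoviePipeline': 300,
}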