自顶向下redis4.0(5)持久化
redis4.0的持久化
简介
虽然redis
是内存数据库,但它也提供了持久化的功能。其中rdb
持久化可以定时备份用于回滚,而aof
持久化则更接近数据库最新的状态,服务器重启后可以恢复至最新的状态。两者数据备份的粒度不同,rdb
将整个数据库备份,aof
持久化粒度更为小,但生成的文件更大。如果有多个线程同时向磁盘写入,那么会增大磁盘的压力,最终导致线程阻塞,因此redis
在同一时间只允许一个持久化向磁盘写入数据。redis
默认配置关闭aof
持久化,开启rdb
后台持久化。由于aof
持久化数据较新,所以如果开启了aof
持久化,redis
启动时会选择加载aof
文件中的数据。
# 默认关闭aof
appendonly no
# after 900 sec (15 min) if at least 1 key changed
# after 300 sec (5 min) if at least 10 keys changed
# after 60 sec if at least 10000 keys changed
save 900 1
save 300 10
save 60 10000
正文
rdb持久化
redis
允许save
命令和bgsave
命令,还支持配置定期保存rdb
数据。
save命令
save
命令使用saveCommand
函数直接调用rdbSave
函数在主线程保存数据,线上模式不建议使用。在进一步介绍之前,我们先看一眼相关的成员。
/* Abridged view of struct redisServer: only the RDB-persistence members
 * referenced by the SAVE code path are listed here. */
struct redisServer {
    /* RDB persistence */
    pid_t rdb_child_pid;    /* PID of RDB saving child */
    char *rdb_filename;     /* Name of RDB file */
    long long dirty;        /* Changes to DB from the last rdb save */
    time_t lastsave;        /* Unix time of last successful save */
    int lastbgsave_status;  /* C_OK or C_ERR */
};
如果已经有rdb
子进程在运行,则会直接返回。如果没有运行的子进程,则将数据存储到server.rdb_filename
文件中,默认为dump.rdb
。rdbSave
函数会打开一个临时文件,向其写入数据后,刷新数据到磁盘,然后重命名这个临时文件为dump.rdb
。然后重置server.dirty
为0
,设置lastsave
时间。
/* SAVE command: synchronously dump the dataset to server.rdb_filename from
 * the calling (main) thread.
 *
 * Replies with an error if a background RDB child is already running,
 * with shared.ok when rdbSave() returns C_OK, and with shared.err when the
 * save fails, so the client always receives exactly one reply. */
void saveCommand(client *c) {
    if (server.rdb_child_pid != -1) {
        addReplyError(c,"Background save already in progress");
        return;
    }
    if (rdbSave(server.rdb_filename,NULL) == C_OK) {
        addReply(c,shared.ok);
    } else {
        addReply(c,shared.err);
    }
}
具体写入数据的操作位于rdbSaveRio
,它会先写入rdb
的版本,再写入一些辅助信息,然后将每个db
中的数据写入,最后写入校验码。
bgsave命令
bgsave
命令会调用fork
函数开启子进程,在子进程中调用rdbSave
函数。
和save
命令相同,如果有正在运行的子进程在存储数据,则会返回错误提示。但如果使用bgsave schedule
命令并且当前的子进程为aof
,则可以延迟调用bgsave
命令。
/* Abridged view of struct redisServer: only the members used by the
 * background RDB save path are shown; "..." marks omitted members. */
struct redisServer {
...
/* RDB persistence */
pid_t rdb_child_pid; /* PID of RDB saving child */
int child_info_pipe[2]; /* Pipe used to write the child_info_data. */
struct {
int process_type; /* AOF or RDB child? */
size_t cow_size; /* Copy on write size. */
unsigned long long magic; /* Magic value to make sure data is valid. */
} child_info_data;
...
};
后台启动rdb
就是调用fork
函数创建一个子进程,在子进程中调用rdbSave
函数。在调用fork
函数之前,redis
会先创建一个管道用于子进程向父进程的单向通信,fork
后的子进程会和父进程共享文件描述符,所以可以通过管道文件描述符单向通信。在子进程存储db
数据的时候,会修改内存空间,造成copy-on-write
,占用额外的内存空间,数据存储完成后,子进程会向父进程发送额外创建的内存大小。
fork(2) * The child inherits copies of the parent's set of open file descriptors. Each file descriptor in the child refers to the same open file description (see open(2)) as the corresponding file descriptor in the parent. This means that the two file descriptors share open file status flags, file offset, and signal- driven I/O attributes (see the description of F_SETOWN and F_SETSIG in fcntl(2)).
/* Start a background RDB save: fork a child that calls rdbSave() on
 * `filename` while the parent keeps serving clients.
 *
 * Returns C_OK when the child was started. Returns C_ERR when an AOF or RDB
 * child is already running, or when fork() fails. */
int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) {
    pid_t childpid;
    long long start;

    if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;

    /* Remember the state at fork time: the done handler subtracts
     * dirty_before_bgsave from server.dirty, and serverCron uses
     * lastbgsave_try to space out retries after a failure. */
    server.dirty_before_bgsave = server.dirty;
    server.lastbgsave_try = time(NULL);
    openChildInfoPipe(); /* Create the child -> parent info pipe. */

    start = ustime();
    if ((childpid = fork()) == 0) {
        /* Child */
        int retval;

        /* File descriptors are inherited across fork(), so close the
         * listening sockets in the child. */
        closeListeningSockets(0);
        redisSetProcTitle("redis-rdb-bgsave");
        retval = rdbSave(filename,rsi);
        if (retval == C_OK) {
            size_t private_dirty = zmalloc_get_private_dirty(-1);

            /* Report the copy-on-write size to the parent via the pipe. */
            server.child_info_data.cow_size = private_dirty;
            sendChildInfo(CHILD_INFO_TYPE_RDB);
        }
        exitFromChild((retval == C_OK) ? 0 : 1);
    } else {
        /* Parent */
        server.stat_fork_time = ustime()-start;
        if (childpid == -1) {
            /* fork() failed: there is no child, so release the pipe and
             * report the error instead of recording a bogus pid. */
            closeChildInfoPipe();
            server.lastbgsave_status = C_ERR;
            serverLog(LL_WARNING,"Can't save in background: fork: %s",
                strerror(errno));
            return C_ERR;
        }
        serverLog(LL_NOTICE,"Background saving started by pid %d",childpid);
        server.rdb_save_time_start = time(NULL);
        server.rdb_child_pid = childpid;
        server.rdb_child_type = RDB_CHILD_TYPE_DISK;
        updateDictResizePolicy();
        return C_OK;
    }
    return C_OK; /* unreached */
}
父进程此时记录子进程id rdb_child_pid
和类型。然后在之前注册的时间事件serverCron
中检查子进程是否结束。wait3
等待子进程的状态发生改变,可能是运行结束了,也可能是被信号暂停或者恢复了。如果子进程已经结束则接收子进程通过管道发送的信息,也就是Copy-On-Write
的大小。然后关闭管道。
/* Abridged excerpt of serverCron(): the part that reaps a finished
 * persistence child. "..." marks omitted code. */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
...
// A child process is currently dumping the full dataset
if (server.rdb_child_pid != -1|| server.aof_child_pid != -1 ||
ldbPendingChildren())
{
int statloc;
pid_t pid;
// WNOHANG: poll for a child state change without blocking
if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
int exitcode = WEXITSTATUS(statloc);
int bysignal = 0;
if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);
if (pid == server.rdb_child_pid) {
backgroundSaveDoneHandler(exitcode,bysignal);
// Read the info sent by the child only when it exited successfully
if (!bysignal && exitcode == 0) receiveChildInfo();
}
updateDictResizePolicy();
closeChildInfoPipe();
}
}
}
由于我们此处是RDB
存储(与之相对的是AOF
重写,但如果开启RDB
格式存储,两者几乎等价),backgroundSaveDoneHandler
会调用backgroundSaveDoneHandlerDisk
函数。这里会将rdb_child_pid
等数据重置,如果保存成功,则更新server.dirty
以及lastsave
。
/* Called in the parent when a disk-targeted background RDB save child has
 * terminated. `exitcode` is the child's exit status and `bysignal` is the
 * terminating signal number (0 if it was not killed by a signal).
 * Updates dirty/lastsave/lastbgsave_status according to the outcome and
 * then resets the RDB child bookkeeping fields. */
void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
if (!bysignal && exitcode == 0) {
serverLog(LL_NOTICE,
"Background saving terminated with success");
/* Keep only the changes made after the snapshot was started. */
server.dirty = server.dirty - server.dirty_before_bgsave;
server.lastsave = time(NULL);
server.lastbgsave_status = C_OK;
} else if (!bysignal && exitcode != 0) {
serverLog(LL_WARNING, "Background saving error");
server.lastbgsave_status = C_ERR;
} else {
mstime_t latency;
serverLog(LL_WARNING,
"Background saving terminated by signal %d", bysignal);
latencyStartMonitor(latency);
rdbRemoveTempFile(server.rdb_child_pid);
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("rdb-unlink-temp-file",latency);
/* SIGUSR1 is whitelisted, so we have a way to kill a child without
 * triggering an error condition. */
if (bysignal != SIGUSR1)
server.lastbgsave_status = C_ERR;
}
/* In every case the child is gone: clear the child state and record how
 * long the save took. */
server.rdb_child_pid = -1;
server.rdb_child_type = RDB_CHILD_TYPE_NONE;
server.rdb_save_time_last = time(NULL)-server.rdb_save_time_start;
server.rdb_save_time_start = -1;
}
rdb定期保存数据
redis
默认添加3个定期保存参数,如果使用redis.conf
,则会清空默认配置使用redis.conf
配置。如果redis.conf
中没有配置,则不会使用rdb
定期保存。
appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
同样是在serverCron
函数中,如果当前没有aof
或者rdb
子进程存储数据,则会检测条件是否满足。如果(距离上一次写入的时间和数据变更的数量满足条件)并且(上一次写入成功或者距离上一次写入已经超过5秒钟,默认的CONFIG_BGSAVE_RETRY_DELAY
值) ,则启动rdb
序列化。
/* Excerpt of serverCron(): when no persistence child is running, check the
 * configured save points and start a background save if one is met. */
if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
ldbPendingChildren())
{
...
} else {
/* If there is not a background saving/rewrite in progress check if
 * we have to save/rewrite now. */
for (j = 0; j < server.saveparamslen; j++) {
struct saveparam *sp = server.saveparams+j;
/* Save if we reached the given amount of changes,
 * the given amount of seconds, and if the latest bgsave was
 * successful or if, in case of an error, at least
 * CONFIG_BGSAVE_RETRY_DELAY seconds already elapsed. */
if (server.dirty >= sp->changes &&
server.unixtime-server.lastsave > sp->seconds &&
(server.unixtime-server.lastbgsave_try >
CONFIG_BGSAVE_RETRY_DELAY || // value is 5
server.lastbgsave_status == C_OK))
{
serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",
sp->changes, (int)sp->seconds);
rdbSaveInfo rsi, *rsiptr;
rsiptr = rdbPopulateSaveInfo(&rsi);
rdbSaveBackground(server.rdb_filename,rsiptr);
break;
}
}
/* Trigger an AOF rewrite if needed. */
...
}
进程结束保存数据
在redis
正常关闭的情况下(接受客户端shutdown
命令或者是收到SIGTERM
信号),会调用prepareForShutdown
函数。该函数会关闭正在存储的子进程。如果有配置定期存储rdb
或者是关闭时有传入save
参数,则会在主线程中调用rdbSave
存储数据等,接着关闭进程。
可以看到在使用rdb
保存数据之前,如果开启了AOF
,那么redis
会调用flushAppendOnlyFile
强制将数据写入磁盘,并调用aof_fsync
保证数据刷新。
/* Prepare the server for a clean shutdown.
 *
 * `flags` may contain SHUTDOWN_SAVE (force a final RDB save) and/or
 * SHUTDOWN_NOSAVE (skip the final save even if save points are configured).
 *
 * Steps, in order: kill persistence children, flush and fsync the AOF when
 * AOF is enabled, optionally write a final RDB synchronously, remove the pid
 * file, flush slave output buffers and close the listening sockets.
 *
 * Returns C_OK when the process may exit, C_ERR when shutdown must be
 * aborted (initial AOF still being written, or the final RDB save failed). */
int prepareForShutdown(int flags) {
int save = flags & SHUTDOWN_SAVE;
int nosave = flags & SHUTDOWN_NOSAVE;
serverLog(LL_WARNING,"User requested shutdown...");
/* Kill all the Lua debugger forked sessions. */
ldbKillForkedSessions();
/* Kill the saving child if there is a background saving in progress.
We want to avoid race conditions, for instance our saving child may
overwrite the synchronous saving did by SHUTDOWN. */
if (server.rdb_child_pid != -1) {
serverLog(LL_WARNING,"There is a child saving an .rdb. Killing it!");
kill(server.rdb_child_pid,SIGUSR1);
rdbRemoveTempFile(server.rdb_child_pid);
}
if (server.aof_state != AOF_OFF) {
/* Kill the AOF saving child as the AOF we already have may be longer
 * but contains the full dataset anyway. */
if (server.aof_child_pid != -1) {
/* If we have AOF enabled but haven't written the AOF yet, don't
 * shutdown or else the dataset will be lost. */
if (server.aof_state == AOF_WAIT_REWRITE) {
serverLog(LL_WARNING, "Writing initial AOF, can't exit.");
return C_ERR;
}
serverLog(LL_WARNING,
"There is a child rewriting the AOF. Killing it!");
kill(server.aof_child_pid,SIGUSR1);
}
/* Append only file: flush buffers and fsync() the AOF at exit */
serverLog(LL_NOTICE,"Calling fsync() on the AOF file.");
flushAppendOnlyFile(1);
aof_fsync(server.aof_fd);
}
/* Create a new RDB file before exiting. */
if ((server.saveparamslen > 0 && !nosave) || save) {
serverLog(LL_NOTICE,"Saving the final RDB snapshot before exiting.");
/* Snapshotting. Perform a SYNC SAVE and exit */
rdbSaveInfo rsi, *rsiptr;
rsiptr = rdbPopulateSaveInfo(&rsi);
if (rdbSave(server.rdb_filename,rsiptr) != C_OK) {
/* Ooops.. error saving! The best we can do is to continue
 * operating. Note that if there was a background saving process,
 * in the next cron() Redis will be notified that the background
 * saving aborted, handling special stuff like slaves pending for
 * synchronization... */
serverLog(LL_WARNING,"Error trying to save the DB, can't exit.");
return C_ERR;
}
}
/* Remove the pid file if possible and needed. */
if (server.daemonize || server.pidfile) {
serverLog(LL_NOTICE,"Removing the pid file.");
unlink(server.pidfile);
}
/* Best effort flush of slave output buffers, so that we hopefully
 * send them pending writes. */
flushSlavesOutputBuffers();
/* Close the listening sockets. Apparently this allows faster restarts. */
closeListeningSockets(1);
serverLog(LL_WARNING,"%s is now ready to exit, bye bye...",
server.sentinel_mode ? "Sentinel" : "Redis");
return C_OK;
}
aof持久化
数据缓冲区
上文已经提到,redis
在解析客户端请求到client->argc
和client->argv
后会调用processCommand
检查请求命令的条件是否满足,如果满足,则会调用call(client, CMD_CALL_FULL)
。
/* Command call flags, see call() function */
#define CMD_CALL_NONE 0
#define CMD_CALL_SLOWLOG (1<<0)
#define CMD_CALL_STATS (1<<1)
#define CMD_CALL_PROPAGATE_AOF (1<<2)
#define CMD_CALL_PROPAGATE_REPL (1<<3)
#define CMD_CALL_PROPAGATE (CMD_CALL_PROPAGATE_AOF|CMD_CALL_PROPAGATE_REPL)
#define CMD_CALL_FULL (CMD_CALL_SLOWLOG | CMD_CALL_STATS | CMD_CALL_PROPAGATE)
在这里,我们观察一下CMD_CALL_FULL
,此时我们只需要知道,该值包含CMD_CALL_PROPAGATE
。在调用完命令后,redis
会根据情况将命令追加到server->aof_buf
中,如果数据有发生改动,命令没有禁止propagate,并且redis
开启了aof
,则会将命令追加到缓冲区。
/* Execute the command attached to client `c` and, depending on `flags`
 * (a combination of the CMD_CALL_* macros), propagate it to the AOF and to
 * the replication link.
 *
 * Abridged excerpt: only the propagation logic is shown. */
void call(client *c, int flags) {
    long long dirty;

    /* Run the command and compute how many changes it produced by
     * comparing server.dirty before and after the call. */
    dirty = server.dirty;
    c->cmd->proc(c);
    dirty = server.dirty-dirty;
    if (dirty < 0) dirty = 0;

    /* Propagate the command into the AOF and replication link */
    if (flags & CMD_CALL_PROPAGATE && /* flags is CMD_CALL_FULL here */
        (c->flags & CLIENT_PREVENT_PROP) != CLIENT_PREVENT_PROP)
    {
        int propagate_flags = PROPAGATE_NONE;

        /* The command changed the dataset: propagate to both targets. */
        if (dirty) propagate_flags |= (PROPAGATE_AOF|PROPAGATE_REPL);

        /* Some commands force propagation even without changes
         * (for example publishing a message). */
        if (c->flags & CLIENT_FORCE_REPL) propagate_flags |= PROPAGATE_REPL;
        if (c->flags & CLIENT_FORCE_AOF) propagate_flags |= PROPAGATE_AOF;

        /* Some commands (for example SPOP) forbid propagation here and
         * propagate by themselves from another function. */
        if (c->flags & CLIENT_PREVENT_REPL_PROP ||
            !(flags & CMD_CALL_PROPAGATE_REPL))
                propagate_flags &= ~PROPAGATE_REPL;
        if (c->flags & CLIENT_PREVENT_AOF_PROP ||
            !(flags & CMD_CALL_PROPAGATE_AOF))
                propagate_flags &= ~PROPAGATE_AOF;

        /* Call propagate() only if at least one of AOF / replication
         * propagation is needed. Note that modules commands handle replication
         * in an explicit way, so we never replicate them automatically. */
        if (propagate_flags != PROPAGATE_NONE && !(c->cmd->flags & CMD_MODULE))
            propagate(c->cmd,c->db->id,c->argv,c->argc,propagate_flags);
    }
}
/* Forward an executed command to the persistence and replication layers.
 *
 * `flags` is a bitmask of PROPAGATE_AOF / PROPAGATE_REPL. The AOF is fed
 * only when the bit is set and AOF is not switched off; the slaves are fed
 * whenever PROPAGATE_REPL is set. */
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
               int flags)
{
    int want_aof = (flags & PROPAGATE_AOF) != 0;
    int want_repl = (flags & PROPAGATE_REPL) != 0;

    if (want_aof && server.aof_state != AOF_OFF) {
        feedAppendOnlyFile(cmd,dbid,argv,argc);
    }
    if (want_repl) {
        replicationFeedSlaves(server.slaves,dbid,argv,argc);
    }
}
在追加命令之前,redis
还会做一些处理,如果命令对应的db
和上次追加命令的db
不同,则插入select
命令 。如果是expire
系列的命令,则全部切换成pexpireat
命令。如果是setex
命令,则拆分成set
和pexpireat
。如果此时没有子进程在重写,则写入到缓冲区,如果有子进程在重写,则尝试将数据发送给子进程。
/* Serialize an executed command in the protocol format and append it to the
 * AOF buffer (and to the rewrite buffer while a rewrite child is running).
 *
 * cmd    - the command that was executed
 * dictid - id of the DB the command targeted; a SELECT is emitted first when
 *          it differs from server.aof_selected_db
 * argv   - command arguments (argv[0] is the command name)
 * argc   - number of entries in argv
 *
 * Relative expires are rewritten as PEXPIREAT, and SETEX/PSETEX/SET with
 * EX|PX are split into SET followed by PEXPIREAT. */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
sds buf = sdsempty();
robj *tmpargv[3];
/* The DB this command was targeting is not the same as the last command
 * we appended. To issue a SELECT command is needed. */
if (dictid != server.aof_selected_db) {
char seldb[64];
snprintf(seldb,sizeof(seldb),"%d",dictid);
buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
(unsigned long)strlen(seldb),seldb);
server.aof_selected_db = dictid;
}
if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
cmd->proc == expireatCommand) {
/* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
} else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
/* Translate SETEX/PSETEX to SET and PEXPIREAT */
tmpargv[0] = createStringObject("SET",3);
tmpargv[1] = argv[1];
tmpargv[2] = argv[3];
buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
decrRefCount(tmpargv[0]);
buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
} else if (cmd->proc == setCommand && argc > 3) {
int i;
robj *exarg = NULL, *pxarg = NULL;
/* Translate SET [EX seconds][PX milliseconds] to SET and PEXPIREAT */
buf = catAppendOnlyGenericCommand(buf,3,argv);
for (i = 3; i < argc; i ++) {
if (!strcasecmp(argv[i]->ptr, "ex")) exarg = argv[i+1];
if (!strcasecmp(argv[i]->ptr, "px")) pxarg = argv[i+1];
}
/* EX and PX must not both be present. */
serverAssert(!(exarg && pxarg));
if (exarg)
buf = catAppendOnlyExpireAtCommand(buf,server.expireCommand,argv[1],
exarg);
if (pxarg)
buf = catAppendOnlyExpireAtCommand(buf,server.pexpireCommand,argv[1],
pxarg);
} else {
buf = catAppendOnlyGenericCommand(buf,argc,argv);
}
/* Append to the AOF buffer. This will be flushed on disk just before
 * of re-entering the event loop, so before the client will get a
 * positive reply about the operation performed. */
if (server.aof_state == AOF_ON)
server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));
/* If a background append only file rewriting is in progress we want to
 * accumulate the differences between the child DB and the current one
 * in a buffer, so that when the child process will do its work we
 * can append the differences to the new append only file. */
if (server.aof_child_pid != -1)
aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
sdsfree(buf);
}
刷新数据到磁盘
appendonly no #关闭aof
# 开启aof后生效
# appendfsync always #aof 磁盘刷新策略
appendfsync everysec
# appendfsync no
redis
默认关闭aof
,如果关闭aof
则server->aof_buf
不会包含任何数据,只有开启了aof
,也就是appendonly yes
,才会往aof
中写入数据。
在配置appendonly yes
之后,appendfsync
配置才会生效,redis
默认配置为everysec
,也就是每秒尝试后台线程刷新数据到磁盘,但写入数据还是主线程写入的,只要有数据且没有子线程在写入数据,就会写入数据。
redis
刷新磁盘的操作也放在beforeSleep
中处理。如果读者看过该系列之前的文章,应该记得redis
返回客户端数据并不是直接发送给客户端,而是先将数据保存在client->buf
中,然后在下一轮的aeMainLoop
前的beforeSleep
函数中调用handleClientsWithPendingWrites
, 将数据返回给客户端。这样做的目的是为了兼容appendfsync always
的效果。所以在beforeSleep
函数中,刷新函数flushAppendOnlyFile
位于handleClientsWithPendingWrites
之前。
/* Abridged excerpt of beforeSleep(). Note the order: the AOF buffer is
 * flushed first, and only then are pending replies written to clients. */
void beforeSleep(struct aeEventLoop *eventLoop) {
...
/* Write the AOF buffer on disk */
flushAppendOnlyFile(0);
/* Handle writes with pending output buffers. */
handleClientsWithPendingWrites();
}
刷新数据也有3种策略,下文会按照no
,always
,everysec
的顺序结合源码讲解。
appendfsync no
在不保证刷新的策略下,redis
也会调用flushAppendOnlyFile
函数,这就等于直接调用aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
将数据写入系统缓冲区,但文件是否刷新到磁盘,以及什么时候刷新由系统决定。由于调用aofWrite
可能会遇到磁盘空间不够的问题,redis
会对比传入的数据长度和写入的数据长度,如果没有全部写入,为了保证下一次加载aof
文件能够顺利,redis
会裁剪掉部分写入的数据,等待下次重新写入。如果裁剪失败,则缩减aof_buf
的长度,删除aof_buf
中已经写入的部分,下次从最新的地方开始写入。并且如果写入系统缓冲区发生问题,则会在处理完问题后返回,而不会调用aof_fsync
等刷新磁盘的函数。
/* Abridged excerpt of flushAppendOnlyFile(): the part that writes
 * server.aof_buf to the AOF file descriptor and handles short writes.
 * The fsync logic that follows in the real function is omitted. */
void flushAppendOnlyFile(int force) {
ssize_t nwritten;
int sync_in_progress = 0;
mstime_t latency;
if (sdslen(server.aof_buf) == 0) return;
nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
static time_t last_write_error_log = 0;
// Short write: some data was written, but not all of it
if (nwritten != -1) {
// Truncate the file back to drop the bytes that were just written
//todo what will happen if system ftruncate the file some part is still in the memory not yet flushed to the disk
if (ftruncate(server.aof_fd, server.aof_current_size) != -1) {
// Truncation succeeded: treat it as if nothing had been written
nwritten = -1;
}
server.aof_last_write_errno = ENOSPC;
}
server.aof_last_write_status = C_ERR;
// Truncation failed: keep the partial write and drop it from the buffer
if (nwritten > 0) {
server.aof_current_size += nwritten;
sdsrange(server.aof_buf,nwritten,-1);
}
return; /* We'll try again on the next call... */
}
server.aof_current_size += nwritten;
// Reuse the buffer when it is small (< 4000 bytes), otherwise reallocate it
if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
sdsclear(server.aof_buf);
} else {
sdsfree(server.aof_buf);
server.aof_buf = sdsempty();
}
// The fsync (flush to disk) logic follows
}
appendfsync always
always
模式保证客户端接受返回数据后,redis
一定已经将数据变化刷新回磁盘。采用该模式相当于redis
在主线程中调用完aofWrite
函数后,紧接着调用了aof_fsync
函数,也就是fsync
系列的函数。该模式迫使redis
在主线程访问磁盘,会导致性能急剧下降。并且always
的容错性较差,如果aofWrite
没有将aof_buf
中的全部数据写入,redis
会立刻退出。
appendfsync everysec
每秒刷新一次数据到磁盘是redis
的默认配置,它会尝试每秒刷新文件到磁盘。由于flushAppendOnlyFile
在serverCron
中被调用,而serverCron
的频率为10次/秒,所以redis
默认写入数据的频率和刷新数据的频率为10:1。如果开启了aof_no_fsync_on_rewrite
,则不会在有子进程全量存储的时候(包括rdb
存储和aof
重写)同步增量aof
数据。
/* Abridged excerpt of flushAppendOnlyFile() focused on the
 * AOF_FSYNC_EVERYSEC policy. `force` non-zero skips the postponing logic.
 * "..." marks the short-write handling shown in the previous excerpt. */
void flushAppendOnlyFile(int force) {
ssize_t nwritten;
int sync_in_progress = 0;
mstime_t latency;
if (sdslen(server.aof_buf) == 0) return;
// Check whether a background fsync job is still pending
if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;
if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
if (sync_in_progress) {
// A background fsync is in progress: postpone this write, first for
// one call and then for up to 2 seconds
if (server.aof_flush_postponed_start == 0) {
server.aof_flush_postponed_start = server.unixtime;
return;
} else if (server.unixtime - server.aof_flush_postponed_start < 2) {
return;
}
// Still not finished after the delay: write anyway, which may block
}
}
nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
server.aof_flush_postponed_start = 0;
if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
// As described earlier: on an incomplete write, handle it and return
...
return; /* We'll try again on the next call... */
}
// The data has been handed to the OS at this point; reset `aof_buf`
sdsfree(server.aof_buf);
server.aof_buf = sdsempty();
/* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
 * children doing I/O in the background. */
if (server.aof_no_fsync_on_rewrite &&
(server.aof_child_pid != -1 || server.rdb_child_pid != -1))
return;
if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.unixtime > server.aof_last_fsync)) {
if (!sync_in_progress) aof_background_fsync(server.aof_fd);
server.aof_last_fsync = server.unixtime;
}
}
redis
在将数据写入磁盘时,会在主线程调用write
函数,然后在另外的线程中调用fsync
函数。这样能够让另外一个线程阻塞在IO
上而不影响主线程的操作,但需要注意的是如果另一个线程fsync
函数还没有返回,主线程就调用write
函数,那么主线程也会阻塞在write
函数上。[4]
《Redis开发与运维》[2]中提到
通过对AOF阻塞流程可以发现两个问题:
1) everysec配置最多可能丢失2秒数据, 不是1秒
2) 如果系统fsync缓慢, 将会导致Redis主线程阻塞影响效率。
实际上在redis
4.0版本中,everysec
配置最多可能丢失2秒加上一个aeMainLoop
循环的时间。虽然《Redis开发与运维》指出了两个问题,但实际上它们是同一个问题,那就是磁盘写入速度无法承受过量的数据。在使用everysec
配置时,如果发生这个问题,redis
首先考虑主线程的运行,如果距离上一次延迟写入的时间戳aof_flush_postponed_start
小于2秒,那么先跳过这一次的写入,避免阻塞以保证主线程能够处理请求。如果2秒后数据还没有从缓冲区刷新到磁盘,那么将会调用aofWrite
导致主线程阻塞。
aof重写
aof重写的配置
aof
重写可以输入指令触发bgrewriteaof
,也可以配置条件触发重写。
auto-aof-rewrite-min-size 64mb
auto-aof-rewrite-percentage 100
仅仅这两个配置还不能了解清楚redis
何时重写,我们还需要有aof_current_size
和aof_base_size
,aof_current_size
就是aof
文件当前的大小,redis
启动加载aof
文件或者每次aof
追加数据都会更新这个值,这个值并不会存储到磁盘中,aof_base_size
也是同理,如果启动时有加载aof
文件,那么aof_base_size
的值就是aof
文件的大小。
当aof_current_size
>auto-aof-rewrite-min-size
并且有配置auto-aof-rewrite-percentage
时,如果(aof_current_size
-aof_base_size
)/aof_base_size*100
>= percentage
,则会自动重写。比如按照上文的配置,redis
启动时加载的aof
文件大小为100mb
,那么aof_base_size
就是100mb
,当redis
文件增长到200mb
的时候就会自动重写。
但是会存在这样一种情况,redis
文件增长到199mb
的时候,刚好重启了,那么下次启动的时候,aof_base_size
就和aof_current_size
大小相等,想要触发自动重写,就要等到redis
文件大小增长到400mb
左右。如果数据增长地比较缓慢,或者是百分比配置较大。在触发重写之前,redis
就关闭或者重启了。那么aof_base_size
下次启动的时候会被刷新成aof_current_size
的大小,导致可能永远无法触发自动重写。
aof重写的优先级
aof
重写的优先级低于rdb
,如果两者的触发条件同时满足,redis
会优先处理rdb
存储。观察源代码,可以发现rdb
存储先于aof
,如果rdb
此处触发,即使aof
触发重写的条件满足,因为server.rdb_child_pid
将不为-1
,导致无法进入aof
重写。
serverCron(aeEventLoop*, longlong, void*) {
if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
ldbPendingChildren()) {
//... 检查子进程是否结束并处理。
} else {
/* If there is not a background saving/rewrite in progress check if
* we have to save/rewrite now. */
for (j = 0; j < server.saveparamslen; j++) {
...
//..处理rdb自动存储
}
/* Trigger an AOF rewrite if needed. */
if (server.aof_state == AOF_ON &&
server.rdb_child_pid == -1 &&
server.aof_child_pid == -1 &&
server.aof_rewrite_perc &&
server.aof_current_size > server.aof_rewrite_min_size)
{
long long base = server.aof_rewrite_base_size ?
server.aof_rewrite_base_size : 1;
long long growth = (server.aof_current_size*100/base) - 100;
if (growth >= server.aof_rewrite_perc) {
serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
rewriteAppendOnlyFileBackground();
}
}
}
}
aof 重写的来龙去脉
rewriteAppendOnlyFileBackground
会创建许多管道用于父子间通信。
childInfoPipe
用于子进程向父进程提示有多少个Copy-On-Write
内存。aof_pipe_write_data_to_child
用于父进程向aof
重写子进程发送最近的数据变更。aof_pipe_write_ack_to_parent
和aof_pipe_write_ack_to_child
用于等待彼此的确认消息。
并且注册了aof_pipe_read_ack_from_child
的文件事件,当子进程向父进程发送中止请求的时候,就会调用aofChildPipeReadable
函数。
/* Create the three pipes used between the parent and the AOF rewrite child
 * (parent -> child data, child -> parent ack, parent -> child ack) and
 * register a readable event on the child -> parent ack pipe.
 * Returns C_OK on success; on failure closes any fd already opened and
 * returns C_ERR. */
int aofCreatePipes(void) {
int fds[6] = {-1, -1, -1, -1, -1, -1};
int j;
if (pipe(fds) == -1) goto error; /* parent -> children data. */
if (pipe(fds+2) == -1) goto error; /* children -> parent ack. */
if (pipe(fds+4) == -1) goto error; /* parent -> children ack. */
/* Parent -> children data is non blocking. */
if (anetNonBlock(NULL,fds[0]) != ANET_OK) goto error;
if (anetNonBlock(NULL,fds[1]) != ANET_OK) goto error;
// Note:
// a file event is registered here on the read end of the child -> parent ack pipe
if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error;
server.aof_pipe_write_data_to_child = fds[1];
server.aof_pipe_read_data_from_parent = fds[0];
server.aof_pipe_write_ack_to_parent = fds[3];
server.aof_pipe_read_ack_from_child = fds[2];
server.aof_pipe_write_ack_to_child = fds[5];
server.aof_pipe_read_ack_from_parent = fds[4];
server.aof_stop_sending_diff = 0;
return C_OK;
error:
serverLog(LL_WARNING,"Error opening /setting AOF rewrite IPC pipes: %s",
strerror(errno));
for (j = 0; j < 6; j++) if(fds[j] != -1) close(fds[j]);
return C_ERR;
}
父进程 创建完子进程后,父进程会更新aof_child_pid
记录子进程id
,虽然只更新了一个字段,但意味着已经开启了一个很有可能影响redis
性能的任务。
子进程 先向临时文件写入当前数据库的内容,如果开启了aof_use_rdb_preamble
(默认关闭,但建议开启),那么就会写入rdb
数据,也就是db
数据全量存储,否则按aof
追加模式,全量存储db
中的内容,接着刷新数据到磁盘,阻塞。
//in function rewriteAppendOnlyFile(char* filename)
if (server.aof_use_rdb_preamble) {
int error;
if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {
errno = error;
goto werr;
}
} else {
if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
}
/* Do an initial slow fsync here while the parent is still sending
* data, in order to make the next final fsync faster. */
if (fflush(fp) == EOF) goto werr;
if (fsync(fileno(fp)) == -1) goto werr;
父进程 在aof
子进程等待数据刷新的时候,继续处理请求,并且将数据追加到server.aof_rewrite_buf_blocks
,如果没有注册aof_pipe_write_data_to_child
(是个管道,也就是文件描述符)文件事件的话,会将该管道和aofChildWriteDiffData
绑定,如果管道可写,则会将server.aof_rewrite_buf_blocks
中的数据写入管道发送给子进程。这样保证了父进程不会因为向管道写入数据而阻塞。
/* Append data to the AOF rewrite buffer, allocating new blocks if needed.
 * Afterwards, if no file event is installed on the parent -> child data
 * pipe, install a writable event so aofChildWriteDiffData can send the
 * buffered data to the child. Abridged: the copy loop body is omitted. */
void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
listNode *ln = listLast(server.aof_rewrite_buf_blocks);
aofrwblock *block = ln ? ln->value : NULL;
while(len) {
...
// Keep copying the data into aof_rewrite_buf_blocks
}
// Register the file event
if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) {
aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child,
AE_WRITABLE, aofChildWriteDiffData, NULL);
}
}
子进程 刷新完之前的数据后,会在1秒内一直读取来自父进程的数据,将其写入到aof_child_diff
中。然后向父进程发送停发数据请求。
//in function rewriteAppendOnlyFile(char* filename)
mstime_t start = mstime();
while(mstime()-start < 1000 && nodata < 20) {
if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)
{
nodata++;
continue;
}
nodata = 0; /* Start counting from zero, we stop on N *contiguous*
timeouts. */
aofReadDiffFromParent();
}
if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;
父进程 在aeMainLoop
中检测到aof_pipe_read_ack_from_child
管道可读事件(在创建管道的时候注册,请看前文),调用aofChildPipeReadable
函数,将aof_stop_sending_diff
设置为1,父进程不会再将aof_rewrite_buf_blocks
缓冲区的内容写给子进程。并向子进程发送消息表示已经收到停发请求。
子进程 接受到父进程的同意后,最后读取一次数据,因为在父进程接受到停发请求前可能又发送了数据。至此,停发请求前的额外aof
增量数据都已写入aof_child_diff
。接着子进程将其写入文件并刷新,退出子进程。
if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||
byte != '!') goto werr;
aofReadDiffFromParent();
if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)
goto werr;
/* Make sure data will not remain on the OS's output buffers */
if (fflush(fp) == EOF) goto werr;
if (fsync(fileno(fp)) == -1) goto werr;
if (fclose(fp) == EOF) goto werr;
父进程 在serverCron
函数中调用wait3
检测到aof
重写子进程的退出,会调用backgroundRewriteDoneHandler
处理。
它首先会打开之前保存的临时文件,将中止请求后的追加数据aof_rewrite_buf_blocks
写入文件(注意:虽然子进程之前请求中止发送数据了,但因为aof_child_pid
直到现在还是保存的子进程的id,会一直接受追加数据到aof_rewrite_buf_blocks
)。此时已经将所有的数据都写入aof
临时文件。接下来就是将临时文件替换为aof
保存的文件名。
rdb对比aof
官网有一篇文章《persistence》已经做了比对,在此不再赘述。
参考文献
[1]《Redis 源码》
[2]《Redis开发与运维》
[3]《Redis设计与实现》
[4]《fsync() on a different thread: apparently a useless trick》
[5]《wait3(2) - Linux man page》