结合consul raft库理解raft
一 入口 github.com/hashicorp/consul/agent/consul/server.go
func (s *Server) setupRaft() error {
状态机,用于consul 信息的查询,如kv等
s.fsm, err = fsm.New(s.tombstoneGC, s.config.LogOutput)
...
用于raft的rpc
trans := raft.NewNetworkTransportWithConfig(transConfig)
...
store, err := raftboltdb.NewBoltStore(filepath.Join(path, "raft.db"))
...
持久化任期和选举的信息
stable = store
cacheStore, err := raft.NewLogCache(raftLogCacheSize, store)
...
存储日志
log = cacheStore
...
snapshots, err := raft.NewFileSnapshotStore(path, snapshotsRetained, s.config.LogOutput)
...
快照
snap = snapshots
...
创建raft
s.raft, err = raft.NewRaft(s.config.RaftConfig, s.fsm, log, stable, snap, trans)
...
}
二 对应的函数简介
1 状态机
github.com/hashicorp/consul/agent/consul/fsm/fsm.go中
func New(gc *state.TombstoneGC, logOutput io.Writer) (*FSM, error) {
初始化一个内存数据库
stateNew, err := state.NewStateStore(gc)
...
}
func (c *FSM) Apply(log *raft.Log) interface{} {
...
查找日志对应的处理函数
if fn := c.apply[msgType]; fn != nil {
return fn(buf[1:], log.Index)
}
...
}
github.com/hashicorp/consul/agent/consul/fsm/commands_oss.go中
注册日志处理函数
func init() {
...
registerCommand(structs.KVSRequestType, (*FSM).applyKVSOperation)
...
}
func (c *FSM) applyKVSOperation(buf []byte, index uint64) interface{} {
...
switch req.Op {
case api.KVSet:
return c.state.KVSSet(index, &req.DirEnt)
...
}
...
}
github.com/hashicorp/consul/agent/consul/state/state_store.go中
内存数据库
func NewStateStore(gc *TombstoneGC) (*Store, error) {
...
...
}
github.com/hashicorp/consul/agent/consul/state/kvs.go
func (s *Store) KVSSet(idx uint64, entry *structs.DirEntry) error {
tx := s.db.Txn(true)
defer tx.Abort()
if err := s.kvsSetTxn(tx, idx, entry, false); err != nil {
return err
}
tx.Commit()
return nil
}
2 raft rpc
处理rpc请求
github.com/hashicorp/raft/net_transport.go中
func NewNetworkTransportWithConfig(
config *NetworkTransportConfig,
) *NetworkTransport {
...
go trans.listen()
...
}
监听
func (n *NetworkTransport) listen() {
for{
...
go n.handleConn(conn)
}
处理和响应入口
func (n *NetworkTransport) handleConn(conn net.Conn) {
for {
if err := n.handleCommand(r, dec, enc); err != nil {
...
}
if err := w.Flush(); err != nil {
...
}
}
处理单个rpc命令的入口:把请求交给消费方,并等待其响应
func (n *NetworkTransport) handleCommand(r *bufio.Reader, dec *codec.Decoder, enc *codec.Encoder) error {
...
select {
发送给请求消费方
case n.consumeCh <- rpc:
case <-n.shutdownCh:
return ErrTransportShutdown
}
...
select {
case resp := <-respCh:
case <-n.shutdownCh:
return ErrTransportShutdown
}
...
}
消费方获取consumech,这个对应的是raft里的rpcch
func (n *NetworkTransport) Consumer() <-chan RPC {
return n.consumeCh
}
3 raft
github.com/hashicorp/raft/api.go中
func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps SnapshotStore, trans Transport) (*Raft, error)
{
...
启动raft流程
r.goFunc(r.run)
启动状态机
r.goFunc(r.runFSM)
启动定期生成快照
r.goFunc(r.runSnapshots)
...
}
github.com/hashicorp/raft/raft.go中
根据raft当前的身份(follower/candidate/leader)运行对应的处理循环
func (r *Raft) run() {
...
switch r.getState() {
case Follower:
r.runFollower()
case Candidate:
r.runCandidate()
case Leader:
r.runLeader()
}
...
}
跟随者
func (r *Raft) runFollower() {
for {
选举复制等rpc请求
case rpc := <-r.rpcCh:
r.processRPC(rpc)
}
...
非leader拒绝apply
case a := <-r.applyCh:
a.respond(ErrNotLeader)
非leader拒绝认证leader
case v := <-r.verifyCh:
v.respond(ErrNotLeader)
...
来自leader的心跳超时
case <-heartbeatTimer:
lastContact := r.LastContact()
if time.Now().Sub(lastContact) < r.conf.HeartbeatTimeout {
continue
}
...
成为候选人
r.setState(Candidate)
...
}
候选人
func (r *Raft) runCandidate() {
选自己
voteCh := r.electSelf()
随机选举超时
electionTimer := randomTimeout(r.conf.ElectionTimeout)
多数派
votesNeeded := r.quorumSize()
...
for r.getState() == Candidate {
这时候如果收到日志复制请求的话,会回退到follower
case rpc := <-r.rpcCh:
r.processRPC(rpc)
case vote := <-voteCh:
对方任期更大,自己回退到follower
if vote.Term > r.getCurrentTerm() {
r.setState(Follower)
r.setCurrentTerm(vote.Term)
return
}
收到选票
if vote.Granted {
grantedVotes++
}
达成多数派
if grantedVotes >= votesNeeded {
r.setState(Leader)
r.setLeader(r.localAddr)
return
}
}
}
func (r *Raft) electSelf() <-chan *voteResult {
任期+1
r.setCurrentTerm(r.getCurrentTerm() + 1)
获取持久化的最新日志index和任期
lastIdx, lastTerm := r.getLastEntry()
req := &RequestVoteRequest{
RPCHeader: r.getRPCHeader(),
Term: r.getCurrentTerm(),
Candidate: r.trans.EncodePeer(r.localID, r.localAddr),
LastLogIndex: lastIdx,
LastLogTerm: lastTerm,
}
askPeer := func(peer Server) {
resp := &voteResult{voterID: peer.ID}
err := r.trans.RequestVote(peer.ID, peer.Address, req, &resp.RequestVoteResponse)
if err != nil {
r.logger.Printf("[ERR] raft: Failed to make RequestVote RPC to %v: %v", peer, err)
resp.Term = req.Term
resp.Granted = false
}
respCh <- resp
}
for _, server := range r.configurations.latest.Servers {
判断peer是否有选举权
if server.Suffrage == Voter {
...
}
}
挑选日志复制和选举rpc分析
func (r *Raft) processRPC(rpc RPC) {
...
switch cmd := rpc.Command.(type) {
case *AppendEntriesRequest:
r.appendEntries(rpc, cmd)
case *RequestVoteRequest:
r.requestVote(rpc, cmd)
...
}
}
日志复制
func (r *Raft) appendEntries(rpc RPC, a *AppendEntriesRequest) {
忽略更小的任期
if a.Term < r.getCurrentTerm() {
return
}
收到更大任期的复制,或者自己当前不是follower,自己回退到follower,更新任期
if a.Term > r.getCurrentTerm() || r.getState() != Follower {
r.setState(Follower)
r.setCurrentTerm(a.Term)
resp.Term = a.Term
}
if a.PrevLogEntry > 0 {
lastIdx, lastTerm := r.getLastEntry()
var prevLogTerm uint64
if a.PrevLogEntry == lastIdx {
prevLogTerm = lastTerm
} else {
var prevLog Log
if err := r.logs.GetLog(a.PrevLogEntry, &prevLog); err != nil {
leader在复制时据此得知log index不匹配,需要回退nextIndex重新匹配
resp.NoRetryBackoff = true
return
}
prevLogTerm = prevLog.Term
}
if a.PrevLogTerm != prevLogTerm {
resp.NoRetryBackoff = true
return
}
...
resp.Success = true
r.setLastContact()
return
}
启动领导
func (r *Raft) runLeader() {
启动复制
r.startStopReplication()
当选后发送一个noop日志,启动复制流程,否则会有日志安全问题
noop := &logFuture{
log: Log{
Type: LogNoop,
},
}
r.dispatchLogs([]*logFuture{noop})
r.leaderLoop()
}
领导执行循环
func (r *Raft) leaderLoop() {
...
for r.getState() == Leader {
case rpc := <-r.rpcCh:
r.processRPC(rpc)
follower复制时响应的任期大于自己的任期,需要回退到follower
case <-r.leaderState.stepDown:
r.setState(Follower)
apply达成多数派
case <-r.leaderState.commitCh:
...
响应上层的命令
case newLog := <-r.applyCh:
ready := []*logFuture{newLog}
for i := 0; i < r.conf.MaxAppendEntries; i++ {
select {
case newLog := <-r.applyCh:
ready = append(ready, newLog)
default:
break
}
}
...
r.dispatchLogs(ready)
租约到期
case <-lease:
检查租约有效性
maxDiff := r.checkLeaderLease()
checkInterval := r.conf.LeaderLeaseTimeout - maxDiff
if checkInterval < minCheckInterval {
checkInterval = minCheckInterval
}
lease = time.After(checkInterval)
}
}
检查租约有效性:统计最近一次交互距今未超过租约时长的follower数量,如果达到多数派,则可以继续担任leader,否则回退为follower;返回值是这些follower中距今最长的交互间隔
func (r *Raft) checkLeaderLease() time.Duration {
var maxDiff time.Duration
now := time.Now()
for peer, f := range r.leaderState.replState {
diff := now.Sub(f.LastContact())
if diff <= r.conf.LeaderLeaseTimeout {
contacted++
if diff > maxDiff {
maxDiff = diff
}
} else {
...
}
quorum := r.quorumSize()
if contacted < quorum {
r.setState(Follower)
}
}
分发日志,通知各个follower的复制协程开始复制
func (r *Raft) dispatchLogs(applyLogs []*logFuture) {
for _, f := range r.leaderState.replState {
asyncNotifyCh(f.triggerCh)
}
}
启动复制
func (r *Raft) startStopReplication() {
for _, server := range r.configurations.latest.Servers {
...
r.goFunc(func() { r.replicate(s) })
...
}
}
复制入口
func (r *Raft) replicate(s *followerReplication) {
...
r.goFunc(func() { r.heartbeat(s, stopHeartbeat) })
...
for !shouldStop {
...
case <-s.triggerCh:
lastLogIdx, _ := r.getLastLog()
shouldStop = r.replicateTo(s, lastLogIdx)
...
}
}
复制操作
func (r *Raft) replicateTo(s *followerReplication, lastIndex uint64) (shouldStop bool) {
...
日志定期会进行快照,然后删除,如果跟随者的log index本地未找到,那就发送日志快照
if err := r.setupAppendEntries(s, &req, s.nextIndex, lastIndex); err == ErrLogNotFound {
goto SEND_SNAP
}else if err!=nil {
return
}
...
if err := r.trans.AppendEntries(s.peer.ID, s.peer.Address, &req, &resp); err != nil {
...
}
...
if resp.Success {
updateLastAppended(s, &req)
...
} else {
回退nextindex 进行尝试复制
s.nextIndex = max(min(s.nextIndex-1, resp.LastLog+1), 1)
if resp.NoRetryBackoff {
s.failures = 0
} else {
s.failures ++
}
}
if s.nextIndex <= lastIndex {
goto START
}
发送日志快照
SEND_SNAP:
if stop, err := r.sendLatestSnapshot(s); stop {
return true
} else if err != nil {
r.logger.Printf("[ERR] raft: Failed to send snapshot to %v: %v", s.peer, err)
return
}
}
成功复制给一个follower,更新状态
func updateLastAppended(s *followerReplication, req *AppendEntriesRequest) {
if logs := req.Entries; len(logs) > 0 {
last := logs[len(logs)-1]
s.nextIndex = last.Index + 1
s.commitment.match(s.peer.ID, last.Index)
}
}
更新follower的复制index
func (c *commitment) match(server ServerID, matchIndex uint64) {
if prev, hasVote := c.matchIndexes[server]; hasVote && matchIndex > prev {
c.matchIndexes[server] = matchIndex
c.recalculate()
}
}
计算某条日志的复制是否已经达成多数派
func (c *commitment) recalculate() {
...
复制达成多数派,通知leader commitch
if quorumMatchIndex > c.commitIndex && quorumMatchIndex >= c.startIndex {
c.commitIndex = quorumMatchIndex
asyncNotifyCh(c.commitCh)
}
...
}
原文地址:https://www.jianshu.com/p/66c1f68e8d63