Mit6.824 Lab2实现笔记

paper地址：http://nil.csail.mit.edu/6.824/2021/schedule.html

前言

建议在实现 Lab2 之前，先阅读 http://nil.csail.mit.edu/6.824/2021/labs/lab-raft.html 中给出的 locking、structure 和 guide 说明，并配合 Raft 论文一起参考。

开始

首先需要了解 Raft 论文的核心知识，也可以用以下伪代码来概括 Raft 算法：

// runServer is an illustrative sketch of the Raft main loop: it loops forever
// and dispatches on the server's current role.
// NOTE(review): this looks like pseudo-code rather than part of the real
// implementation (it reads rf.status and calls a bare AppendEntry()) — confirm
// before trying to compile it.
func (rf *Raft) runServer() {
	for {
		switch rf.status {
		// Leader: heartbeats and log replication.
		case Leader:
                    AppendEntry()
		case Follower:
                // 1. Check for a heartbeat.
                // 2. Check whether this server has already voted.
                // 3. After the timeout, become a Candidate.
		case Candidate:
			// 1. Ask the other servers for their votes.
		}
	}
}

Raft 服务

Raft结构


//
// A Go object implementing a single Raft peer.
//
type Raft struct {
	mu        sync.Mutex          // Lock to protect shared access to this peer's state
	peers     []*labrpc.ClientEnd // RPC end points of all peers
	persister *Persister          // Object to hold this peer's persisted state
	me        int                 // this peer's index into peers[]
	dead      int32               // set by Kill()

	// Your data here (2A, 2B, 2C).
	// Look at the paper's Figure 2 for a description of what
	// state a Raft server must maintain.

	state         RaftState     // current role of this peer (Follower, Candidate or Leader)
	appendentryCh chan *Entry   // log entries waiting to be replicated (per the original note; unused in the code shown here)
	heartBeat     time.Duration // heartbeat interval; ticker sleeps this long between checks
	electionTime  time.Time     // deadline after which ticker starts a new election

	// Persistent state on all servers.
	currentTerm int // current term
	votedFor    int // id of the peer voted for in the current term; -1 means no vote yet
	log         Log // all log entries held by this server

	// Volatile state on all servers (not persisted).
	commitIndex int // index of the last log entry known to be committed
	lastApplied int // index of the last log entry sent to applyCh

	// Volatile state on the leader (not persisted).
	nextIndex  []int // per peer: index of the next log entry to send to that peer
	matchIndex []int // per peer: highest log index known to be replicated on that peer

	// Channel passed to Make(); one ApplyMsg is sent on it for every newly
	// committed log entry.
	applyCh chan ApplyMsg
	// Condition variable (bound to mu) on which applier waits until apply()
	// signals that there may be entries to deliver.
	applyCond *sync.Cond

	// Lab2D Snapshot
	// last snapshot point index
	lastSSPointIndex int // index of the last log entry included in the snapshot
	lastSSPointTerm  int // term of the last log entry included in the snapshot
}

创建 Raft 节点

// Make builds a Raft peer and starts its background goroutines.
//
// peers holds the RPC end points of all peers, me is this peer's index into
// peers, persister holds this peer's persisted state, and applyCh is the
// channel on which applier delivers an ApplyMsg per committed entry.
func Make(peers []*labrpc.ClientEnd, me int,
	persister *Persister, applyCh chan ApplyMsg) *Raft {
	// Every peer boots as a follower in term 0 that has not voted yet.
	rf := &Raft{
		peers:       peers,
		persister:   persister,
		me:          me,
		state:       Follower,
		currentTerm: 0,
		votedFor:    -1,
		heartBeat:   50 * time.Millisecond,
	}
	rf.resetElectionTimer()

	// Start from an empty log holding one placeholder entry.
	// NOTE(review): presumably a sentinel at index 0 so that real entries
	// start at index 1 — confirm against the Entry field order.
	rf.log = makeEmptyLog()
	rf.log.append(Entry{-1, 0, 0})
	rf.commitIndex, rf.lastApplied = 0, 0

	peerCount := len(rf.peers)
	rf.nextIndex = make([]int, peerCount)
	rf.matchIndex = make([]int, peerCount)

	rf.applyCh = applyCh
	// The condition variable shares the peer's mutex.
	rf.applyCond = sync.NewCond(&rf.mu)

	// Restore whatever state was persisted before a crash.
	rf.readPersist(persister.ReadRaftState())

	// ticker drives heartbeats and election timeouts; applier delivers
	// committed entries on applyCh.
	go rf.ticker()
	go rf.applier()
	return rf
}

定时器 timer，作用是计时并且按时间触发leader election或者append entry

ticker会以心跳为周期不断检查状态。心跳包是靠appendEntries()发送空log实现的；如果log不为空，就是发送日志到follower进行日志复制。

如果发现选举超时，这时候就会触发新一轮leader election。

// ticker is the periodic driver for heartbeats and elections. Once per
// heartbeat interval it takes the lock, sends AppendEntries if this peer is the
// leader, and starts an election if the election deadline has passed. It
// returns once the peer has been killed.
func (rf *Raft) ticker() {
	for {
		if rf.killed() {
			return
		}
		time.Sleep(rf.heartBeat)

		rf.mu.Lock()
		if rf.state == Leader {
			// Heartbeat: goes to every peer even when there is nothing new.
			rf.appendEntries(true)
		}
		// Checked after the heartbeat above, because appendEntries resets this
		// peer's own election timer.
		if deadline := rf.electionTime; time.Now().After(deadline) {
			rf.leaderElection()
		}
		rf.mu.Unlock()
	}
}

接下来就是根据角色，不断进行选举超时检测、日志复制等过程

选举

leader 选举

启动新一轮leader election时，首先要将自己转为candidate状态，并且给自己投一票。然后向所有peer请求投票。

// leaderElection starts a new election round: the peer becomes a candidate for
// the next term, votes for itself, persists that, resets its election timer
// and asks every other peer for a vote in parallel.
//
// It is called from ticker with rf.mu held.
func (rf *Raft) leaderElection() {
	rf.state = Candidate
	rf.currentTerm++
	rf.votedFor = rf.me
	rf.persist()
	rf.resetElectionTimer()

	electionTerm := rf.currentTerm
	last := rf.log.lastLog()
	DPrintf("[%v]: start leader election, term %d\n", rf.me, rf.currentTerm)
	request := RequestVoteArgs{
		Term:         electionTerm,
		CandidateId:  rf.me,
		LastLogIndex: last.Index,
		LastLogTerm:  last.Term,
	}

	// The candidate's own vote is counted up front.
	votes := 1
	// Shared by all vote goroutines so the promotion to leader runs only once.
	var becomeLeader sync.Once
	for peer := range rf.peers {
		if peer == rf.me {
			continue
		}
		go rf.candidateRequestVote(peer, &request, &votes, &becomeLeader)
	}
}

发送投票请求 candidateRequestVote

当candidate收到半数以上投票之后就可以进入leader状态，而这个状态转变会更新nextIndex[]和matchIndex[]，并且在成为leader之后要立刻发送一次心跳，不仅是为了重置选举超时计时器，也是为了告诉其他candidate，选举结束。


// candidateRequestVote sends one RequestVote RPC to serverId and handles the
// reply on behalf of the candidate.
//
// args is the request shared by all RPCs of this election round. voteCounter
// and becomeLeader are shared by the round's goroutines too: voteCounter is
// only modified here while rf.mu is held, and becomeLeader makes sure the
// promotion to leader runs at most once.
func (rf *Raft) candidateRequestVote(serverId int, args *RequestVoteArgs, voteCounter *int, becomeLeader *sync.Once) {
	DPrintf("[%d]: term %v send vote request to %d\n", rf.me, args.Term, serverId)
	var reply RequestVoteReply
	if !rf.sendRequestVote(serverId, args, &reply) {
		// The RPC failed, so there is no reply to process.
		return
	}

	rf.mu.Lock()
	defer rf.mu.Unlock()

	switch {
	case reply.Term > args.Term:
		// The voter is in a newer term: adopt it and abandon this round.
		DPrintf("[%d]: %d 在新的term，更新term，结束\n", rf.me, serverId)
		rf.setNewTerm(reply.Term)
		return
	case reply.Term < args.Term:
		// The reply carries an older term than the request; ignore it.
		DPrintf("[%d]: %d 的term %d 已经失效，结束\n", rf.me, serverId, reply.Term)
		return
	case !reply.VoteGranted:
		DPrintf("[%d]: %d 没有投给me，结束\n", rf.me, serverId)
		return
	}
	DPrintf("[%d]: from %d term一致，且投给%d\n", rf.me, serverId, rf.me)

	*voteCounter++
	hasMajority := *voteCounter > len(rf.peers)/2
	stillCandidate := rf.currentTerm == args.Term && rf.state == Candidate
	if !hasMajority || !stillCandidate {
		return
	}

	DPrintf("[%d]: 获得多数选票，可以提前结束\n", rf.me)
	becomeLeader.Do(func() {
		DPrintf("[%d]: 当前term %d 结束\n", rf.me, rf.currentTerm)
		rf.state = Leader
		// Reset the per-peer replication state: send from just past our last
		// entry, with nothing known to be replicated yet.
		next := rf.log.lastLog().Index + 1
		for peer := range rf.peers {
			rf.nextIndex[peer] = next
			rf.matchIndex[peer] = 0
		}
		DPrintf("[%d]: leader - nextIndex %#v", rf.me, rf.nextIndex)
		// Announce leadership immediately with a heartbeat.
		rf.appendEntries(true)
	})
}

投票机制实现 RequestVote

// RequestVote is the RequestVote RPC handler. It decides whether this peer
// grants its vote to the candidate described by args and fills in reply with
// the decision and this peer's current term.
func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) {
	rf.mu.Lock()
	defer rf.mu.Unlock()

	// All-servers rule: a request from a newer term makes this peer adopt
	// that term first (setNewTerm is defined elsewhere).
	if args.Term > rf.currentTerm {
		rf.setNewTerm(args.Term)
	}

	// A candidate from an older term is rejected outright.
	if args.Term < rf.currentTerm {
		reply.Term, reply.VoteGranted = rf.currentTerm, false
		return
	}

	// The candidate's log is up to date when its last entry has a higher term
	// than ours, or the same term and an index at least as large.
	mine := rf.log.lastLog()
	var upToDate bool
	switch {
	case args.LastLogTerm > mine.Term:
		upToDate = true
	case args.LastLogTerm == mine.Term:
		upToDate = args.LastLogIndex >= mine.Index
	}
	// A vote is available if none was cast yet or it already went to this
	// candidate.
	freeToVote := rf.votedFor == -1 || rf.votedFor == args.CandidateId

	reply.VoteGranted = freeToVote && upToDate
	if reply.VoteGranted {
		rf.votedFor = args.CandidateId
		rf.persist()
		rf.resetElectionTimer()
		DPrintf("[%v]: term %v vote %v", rf.me, rf.currentTerm, rf.votedFor)
	}
	reply.Term = rf.currentTerm
}

日志复制

appendEntries

完成选举之后，leader会立刻触发一次心跳包，随后在每个心跳周期发送心跳包，来阻止新一轮leader election。
每个心跳周期，发送一次AppendEntries RPC，当这个RPC不包含log时，这个包被称为心跳包。

// appendEntries sends AppendEntries RPCs from the leader to the other peers,
// one goroutine per peer.
//
// When heartBeat is true every peer gets an RPC, even if it has no entries to
// receive. When it is false only peers whose nextIndex is at or below the
// leader's last log index are contacted. The callers in this file (ticker,
// Start and the leader promotion in candidateRequestVote) all hold rf.mu.
func (rf *Raft) appendEntries(heartBeat bool) {
	lastLog := rf.log.lastLog()
	for peer := range rf.peers {
		if peer == rf.me {
			// The leader keeps its own election timer from firing.
			rf.resetElectionTimer()
			continue
		}

		// Leader rule 3: send when the last log index >= nextIndex for the
		// peer. Heartbeats are sent unconditionally.
		nextIndex := rf.nextIndex[peer]
		if lastLog.Index < nextIndex && !heartBeat {
			continue
		}

		// Keep nextIndex within [1, lastLog.Index+1]. The upper bound is one
		// past the last entry, i.e. "nothing to send". Clamping to
		// lastLog.Index itself would resend the last entry needlessly and,
		// when the log holds only its first placeholder entry, would produce
		// nextIndex 0 and a lookup at index -1.
		if nextIndex <= 0 {
			nextIndex = 1
		}
		if nextIndex > lastLog.Index+1 {
			nextIndex = lastLog.Index + 1
		}

		// The entry just before nextIndex is what the follower must match.
		prevLog := rf.log.at(nextIndex - 1)
		args := AppendEntriesArgs{
			Term:         rf.currentTerm,
			LeaderId:     rf.me,
			PrevLogIndex: prevLog.Index,
			PrevLogTerm:  prevLog.Term,
			Entries:      make([]Entry, lastLog.Index-nextIndex+1),
			LeaderCommit: rf.commitIndex,
		}
		// Copy the entries so the RPC does not share storage with the log.
		copy(args.Entries, rf.log.slice(nextIndex))
		// Send to this peer without blocking the others.
		go rf.leaderSendEntries(peer, &args)
	}
}

leaderSendEntries


// leaderSendEntries performs one AppendEntries RPC to serverId and updates the
// leader's bookkeeping for that peer from the reply: nextIndex and matchIndex
// advance on success, nextIndex backs off on failure, and afterwards the
// commit rule is re-evaluated.
func (rf *Raft) leaderSendEntries(serverId int, args *AppendEntriesArgs) {
	var reply AppendEntriesReply
	if ok := rf.sendAppendEntries(serverId, args, &reply); !ok {
		// The RPC failed; there is no reply to act on.
		return
	}

	rf.mu.Lock()
	defer rf.mu.Unlock()

	// The peer is in a newer term than ours: adopt it and stop.
	if reply.Term > rf.currentTerm {
		rf.setNewTerm(reply.Term)
		return
	}
	// Ignore replies to requests sent in a different term.
	if args.Term != rf.currentTerm {
		return
	}

	switch {
	case reply.Success:
		// Everything up to the last entry we sent is now on the peer. max()
		// keeps a late reply from moving the indexes backwards.
		matched := args.PrevLogIndex + len(args.Entries)
		rf.nextIndex[serverId] = max(rf.nextIndex[serverId], matched+1)
		rf.matchIndex[serverId] = max(rf.matchIndex[serverId], matched)
		DPrintf("[%v]: %v append success next %v match %v", rf.me, serverId, rf.nextIndex[serverId], rf.matchIndex[serverId])

	case reply.Conflict:
		DPrintf("[%v]: Conflict from %v %#v", rf.me, serverId, reply)
		// XTerm == -1: the follower's log is too short, so resume from the
		// length it reported.
		backoff := reply.XLen
		if reply.XTerm != -1 {
			// The follower has a different term at PrevLogIndex. Default to
			// the first index it holds for that term; if our own log contains
			// that term, use our last index in it instead.
			backoff = reply.XIndex
			lastInTerm := rf.findLastLogInTerm(reply.XTerm)
			DPrintf("[%v]: lastLogInXTerm %v", rf.me, lastInTerm)
			if lastInTerm > 0 {
				backoff = lastInTerm
			}
		}
		rf.nextIndex[serverId] = backoff
		DPrintf("[%v]: leader nextIndex[%v] %v", rf.me, serverId, rf.nextIndex[serverId])

	case rf.nextIndex[serverId] > 1:
		// Rejected without conflict details: step back one entry and retry.
		rf.nextIndex[serverId]--
	}

	rf.leaderCommitRule()
}

AppendEntries RPC


// AppendEntries is the AppendEntries RPC handler run by the receiving peer.
// It validates the sender's term, checks that the local log matches at
// args.PrevLogIndex, appends any new entries, advances commitIndex, and
// reports the outcome in reply.
//
// On a log mismatch reply.Conflict is set together with XTerm, XIndex and XLen
// so the leader can back off nextIndex quickly.
func (rf *Raft) AppendEntries(args *AppendEntriesArgs, reply *AppendEntriesReply) {
	rf.mu.Lock()
	defer rf.mu.Unlock()
	DPrintf("[%d]: (term %d) follower received [%v] AppendEntries %v, prevIndex %v, prevLogTerm %v\n",
		rf.me, rf.currentTerm, args.LeaderId, args.Entries, args.PrevLogIndex, args.PrevLogTerm)
	reply.Success = false

	// All-servers rule 2: a request from a newer term makes this peer adopt
	// that term. The request is then processed normally rather than rejected.
	// Returning here would answer a valid leader with Success=false and no
	// conflict details, so the leader would back off nextIndex for no reason
	// and this peer would not reset its election timer.
	if args.Term > rf.currentTerm {
		rf.setNewTerm(args.Term)
	}
	reply.Term = rf.currentTerm

	// AppendEntries rule 1: reject requests from an older term.
	if args.Term < rf.currentTerm {
		return
	}
	// The sender is the leader for the current term, so postpone our election.
	rf.resetElectionTimer()

	// Candidate rule 3: a candidate that hears from the leader steps down.
	if rf.state == Candidate {
		rf.state = Follower
	}

	// AppendEntries rule 2, case 1: our log is too short to contain
	// PrevLogIndex. Report the log length so the leader can resume from there.
	if rf.log.lastLog().Index < args.PrevLogIndex {
		reply.Conflict = true
		reply.XTerm = -1
		reply.XIndex = -1
		reply.XLen = rf.log.len()
		DPrintf("[%v]: Conflict XTerm %v, XIndex %v, XLen %v", rf.me, reply.XTerm, reply.XIndex, reply.XLen)
		return
	}

	// AppendEntries rule 2, case 2: the entry at PrevLogIndex has a different
	// term than the leader expects.
	if xTerm := rf.log.at(args.PrevLogIndex).Term; xTerm != args.PrevLogTerm {
		reply.Conflict = true
		// Walk left from PrevLogIndex to the first index whose predecessor is
		// in a different term, i.e. the first entry of the xTerm run.
		for xIndex := args.PrevLogIndex; xIndex > 0; xIndex-- {
			if rf.log.at(xIndex-1).Term != xTerm {
				reply.XIndex = xIndex
				break
			}
		}
		reply.XTerm = xTerm
		reply.XLen = rf.log.len()
		DPrintf("[%v]: Conflict XTerm %v, XIndex %v, XLen %v", rf.me, reply.XTerm, reply.XIndex, reply.XLen)
		return
	}

	// The log matches at PrevLogIndex, so merge the incoming entries.
	for idx, entry := range args.Entries {
		// AppendEntries rule 3: an existing entry with the same index but a
		// different term conflicts; drop it and everything after it.
		if entry.Index <= rf.log.lastLog().Index && rf.log.at(entry.Index).Term != entry.Term {
			rf.log.truncate(entry.Index)
			rf.persist()
		}
		// AppendEntries rule 4: from the first entry past our log's end,
		// append all remaining entries in one go.
		if entry.Index > rf.log.lastLog().Index {
			rf.log.append(args.Entries[idx:]...)
			DPrintf("[%d]: follower append [%v]", rf.me, args.Entries[idx:])
			rf.persist()
			break
		}
	}

	// AppendEntries rule 5: follow the leader's commit index, bounded by what
	// we actually hold, and wake the applier.
	if args.LeaderCommit > rf.commitIndex {
		rf.commitIndex = min(args.LeaderCommit, rf.log.lastLog().Index)
		rf.apply()
	}
	reply.Success = true
}

完成AppendEntry RPC之后，Leader需要提交已有的日志条目


// leaderCommitRule advances the leader's commitIndex (leader rule 4): an index
// N > commitIndex is committed once a majority of peers hold it
// (matchIndex[i] >= N, with the leader counting itself) and the entry at N
// belongs to the current term. It does nothing on a non-leader.
func (rf *Raft) leaderCommitRule() {
	if rf.state != Leader {
		return
	}

	majority := len(rf.peers)/2 + 1
	// Examine candidate indexes one at a time, starting just past commitIndex.
	for n := rf.commitIndex + 1; n <= rf.log.lastLog().Index; n++ {
		// Only entries from the current term may be committed by counting.
		if rf.log.at(n).Term != rf.currentTerm {
			continue
		}
		// The leader itself always holds the entry.
		replicas := 1
		for peer := range rf.peers {
			if peer != rf.me && rf.matchIndex[peer] >= n {
				replicas++
			}
			if replicas >= majority {
				rf.commitIndex = n
				DPrintf("[%v] leader尝试提交 index %v", rf.me, rf.commitIndex)
				rf.apply()
				break
			}
		}
	}
}

日志应用到状态机

// apply signals that there may be newly committed entries to deliver. It does
// not deliver anything itself: it only wakes the goroutines waiting on
// rf.applyCond (applier waits there when it has nothing to send).
func (rf *Raft) apply() {
	// Wake every waiter on the condition variable.
	rf.applyCond.Broadcast()
	DPrintf("[%v]: rf.applyCond.Broadcast()", rf.me)
}

// applier delivers committed log entries on rf.applyCh, one ApplyMsg per
// entry and in index order. When there is nothing to deliver it waits on
// rf.applyCond until apply() wakes it. The loop ends once the peer is killed.
func (rf *Raft) applier() {
	rf.mu.Lock()
	defer rf.mu.Unlock()

	for !rf.killed() {
		nothingToApply := rf.commitIndex <= rf.lastApplied ||
			rf.log.lastLog().Index <= rf.lastApplied
		if nothingToApply {
			// Wait releases the lock while blocked and re-acquires it on wake-up.
			rf.applyCond.Wait()
			DPrintf("[%v]: rf.applyCond.Wait()", rf.me)
			continue
		}

		rf.lastApplied++
		msg := ApplyMsg{
			CommandValid: true,
			Command:      rf.log.at(rf.lastApplied).Command,
			CommandIndex: rf.lastApplied,
		}
		DPrintVerbose("[%v]: COMMIT %d: %v", rf.me, rf.lastApplied, rf.commits())

		// Release the lock around the channel send so a slow receiver does
		// not block the rest of the peer.
		rf.mu.Unlock()
		rf.applyCh <- msg
		rf.mu.Lock()
	}
}

客户端向Raft服务发送命令

最后是start函数，它会接受客户端的command，并且应用raft算法。

// Start asks this peer to append command to the replicated log.
//
// It returns the index the command was given in the log, the current term,
// and whether this peer is the leader. On a non-leader nothing is appended and
// the result is (-1, currentTerm, false). On the leader the entry is appended,
// persisted, and replication to the followers is started right away.
func (rf *Raft) Start(command interface{}) (int, int, bool) {
	rf.mu.Lock()
	defer rf.mu.Unlock()

	term := rf.currentTerm
	if rf.state != Leader {
		return -1, term, false
	}

	// The new entry goes immediately after the current last entry.
	entry := Entry{
		Command: command,
		Term:    term,
		Index:   rf.log.lastLog().Index + 1,
	}
	rf.log.append(entry)
	rf.persist()
	DPrintf("[%v]: term %v Start %v", rf.me, term, entry)

	// Not a heartbeat: only peers that are behind get an RPC.
	rf.appendEntries(false)
	return entry.Index, term, true
}

posted @ 2022-03-05 17:32 JavaJayV 阅读(219) 评论(0) 收藏举报

刷新页面返回顶部

JavaJayV

Mit6.824 Lab2实现笔记

前言

开始

Raft 服务

选举

日志复制

公告