mit 6.824 lab2B，raft日志复制（后来在 lab2D 中对这部分代码做了大量修改，并找出了很多错误）

lab2 说明：

https://pdos.csail.mit.edu/6.824/labs/lab-raft.html

参考博客：

https://zhuanlan.zhihu.com/p/514512060

https://blog.csdn.net/weixin_45938441/article/details/124797074?spm=1001.2014.3001.5502

实现内容：

实现领导者和追随者代码以附加新的日志条目。

主要函数 FOLLOWER ：AppendEntries 心跳和数据同步

　　　　　LEADER: start 添加条目、心跳发送及 response 相关函数

　　　　　所有服务器：apply 条目提交

理解 test 的目的对于调试很重要：

上图摘自csdn博客

日志复制流程

　　1、客户端通过start函数向Leader提交数据。

　　2、Leader 根据nextIndex数组发送RPC请求调用每个FOLLOWER的AppendEntries 复制条目。

　　3、FOLLOWER处理条目实现mit6.824优化，返回reply。

　　4、Leader 处理reply ，计算每个FOLLOWER新的nextIndex 和 matchIndex

　　5、Leader 根据matchIndex 计算LeaderCommit

　　6、Leader 提交条目。

　　7、Leader 发送心跳包含LeaderCommit，FOLLOWER提交条目。

start函数要注意使用锁，保证提交顺序

requestAppends 函数是发送RPC请求， false 代表不是心跳。

unc (rf *Raft) Start (command interface{}) (int, int, bool) {
	rf.mu.Lock()
	defer rf.mu.Unlock()
	index := -1
	term := -1
	isLeader := true

	// Your code here (2B).
	if rf.status != Leader {
		return index, term, false
	}

	term = rf.currentTerm
	index = rf.start(command)
	isLeader = true

	DDPrintf("s:%v  t:%v  写入数据:%v",rf.me,rf.currentTerm,command)
	rf.requestAppends(false)
	return index, term, isLeader
}

// start wraps command in an Entry stamped with the current term, appends it
// to rf.log, and returns the value reported by rf.log.append (Start hands that
// value back to the client as the entry's index).
func (rf *Raft) start(command interface{}) int {
	// Index is deliberately left at its zero value here.
	// NOTE(review): presumably rf.log.append assigns the real index — confirm.
	entry := Entry{
		Term:  rf.currentTerm,
		Value: command,
	}
	return rf.log.append(entry)
}

Entries包含next位置及其后面所有条目，基本都是按论文里实现的。

// appendEntrier builds one AppendEntries request for peer and sends it from a
// freshly started goroutine.
//
// The request carries every log entry from the peer's next index up to the
// end of the leader's log, so a "heartbeat" and a "data" send differ only in
// the debug line that is printed. When the RPC returns successfully the reply
// is handed to appendSubsequentTreatmentL with rf.mu held.
//
// The function reads rf state without locking.
// NOTE(review): presumably the caller already holds rf.mu — confirm.
func (rf *Raft) appendEntrier(isHeart bool, peer int) {
	from := rf.nextIndex[peer]
	if from < rf.log.Index0 {
		// Never start before the snapshot position.
		// NOTE(review): from == Index0 is let through unchanged while smaller
		// values become Index0+1 — confirm which lower bound is intended.
		from = rf.log.Index0 + 1
	}

	prevIndex := from - 1
	prevTerm := rf.log.getLogITerm(prevIndex)

	// Copy the tail of the log so the request does not alias rf.log while the
	// RPC is in flight.
	tail := make([]Entry, rf.log.lastlogIndex()-from+1)
	copy(tail, rf.log.slice(from))

	args := RequestAppendArgs{
		Term:         rf.currentTerm,
		Leaderld:     rf.me,
		PrelogIndex:  prevIndex,
		PrelogTerm:   prevTerm,
		Entries:      tail,
		LeaderCommit: rf.commitIndex,
	}

	format := "s:%v t:%v  发送数据到 s：%v  value：%v"
	if isHeart {
		format = "s:%v t:%v  发送心跳到 s：%v  value: %v"
	}
	DDPrintf(format, rf.me, rf.currentTerm, peer, args.Entries)

	go func() {
		var reply ResponseAppendArgs
		if !rf.sendAppendEntries(peer, &args, &reply) {
			// RPC failed; nothing to process.
			return
		}
		rf.mu.Lock()
		defer rf.mu.Unlock()
		rf.appendSubsequentTreatmentL(peer, args, reply)
	}()
}

这里看 FOLLOWER 如何处理 AppendEntries

代码的作用

　　① 非 Follower 收到心跳：可能是某个服务器断线后重连。不管接收者是不是 leader，只要收到 term 不小于自身 term 的 AppendEntries，就切换为 follower，之后由集群重新选举出合适的 leader。

　　② 注意重复写入问题，同样的AppendEntries 可能由于网络原因重复接收到，所以要进行重复写入判断，我这里的判断代码非常粗暴。

　　③ 剩下几段代码是根据mit 6.824 快速恢复小节实现的优化

　　④ 追加数据后通过Cond 信号进行提交

// AppendEntries is the RPC handler for heartbeats and log replication.
//
// Outcomes, in the order they are checked:
//   - args.Term is older than ours: reply with our term, Success stays false.
//   - the last entry of args.Entries already equals our last entry (same
//     index and term): treated as a duplicate delivery, Success = true.
//   - our log ends before args.PrelogIndex: Success = false, XTerm = -1,
//     XLen = number of missing entries, XIndex = our last log index.
//   - the term at args.PrelogIndex differs: Success = false, XTerm = our
//     term at that index, XIndex = rf.findStartTermIndex(args.PrelogIndex);
//     our log is cut at XIndex via rf.log.delSlice.
//   - otherwise the entries (if any) are written at PrelogIndex+1, the commit
//     index is advanced, and the apply loop is woken through rf.applyCond.
//
// Every reply produced after the stale-term check carries rf.currentTerm in
// reply.Term.
func (rf *Raft) AppendEntries(args *RequestAppendArgs, reply *ResponseAppendArgs) {
	rf.mu.Lock()
	defer rf.mu.Unlock()

	// Stale leader: refuse and report our newer term.
	if rf.currentTerm > args.Term {
		reply.Term = rf.currentTerm
		return
	}

	// From here on args.Term >= rf.currentTerm. Adopt a newer term even when
	// we are already a follower, and step down if we are not a follower.
	if args.Term > rf.currentTerm || rf.status != Followers {
		rf.newTerm(args.Term)
	}
	reply.Term = rf.currentTerm

	rf.setElectionTime()

	// Duplicate delivery: our log already ends with the last entry of this
	// request, so there is nothing to write.
	n := len(args.Entries)
	if n != 0 && rf.log.lastlogIndex() == args.Entries[n-1].Index && rf.log.lastLogTerm() == args.Entries[n-1].Term {
		reply.Success = true
		return
	}

	// Our log is too short to contain PrelogIndex.
	if rf.log.lastlogIndex() < args.PrelogIndex {
		reply.XLen = args.PrelogIndex - rf.log.lastlogIndex()
		reply.XTerm = -1
		reply.XIndex = rf.log.lastlogIndex()
		reply.Success = false
		DDPrintf("s:%v t:%v  缺失数据 entries:%v     来自s：%v t：%v entries:%v", rf.me, rf.currentTerm, rf.log.Entrys, args.Leaderld, args.Term, args.Entries)
		return
	}

	// Term mismatch at PrelogIndex: report the conflicting term and the index
	// returned by findStartTermIndex, then drop our log from that index on.
	if args.PrelogTerm != rf.log.getLogITerm(args.PrelogIndex) {
		reply.Success = false
		reply.XIndex = rf.findStartTermIndex(args.PrelogIndex)
		// Read the conflicting term before the entry is deleted.
		reply.XTerm = rf.log.getLogITerm(args.PrelogIndex)
		rf.log.delSlice(reply.XIndex)
		DDPrintf("s:%v t:%v  不匹配 entries:%v     来自s：%v t：%v entries:%v   prelogIndex：%v", rf.me, rf.currentTerm, rf.log.Entrys, args.Leaderld, args.Term, args.Entries, args.PrelogIndex)
		return
	}

	// Pure heartbeat: no entries to write, only the commit index may move.
	if n == 0 {
		reply.Success = true
		if rf.commitIndex < args.LeaderCommit {
			rf.commitIndex = args.LeaderCommit
		}
		rf.applyCond.Broadcast()
		DDPrintf("s:%v term:%v 接收心跳  来自 s:%v  term:%v\n", rf.me, rf.currentTerm, args.Leaderld, args.Term)
		return
	}

	rf.log.theIndexAppend(args.PrelogIndex+1, args.Entries)
	DDPrintf("s:%v   追加数据 data:%v", rf.me, args.Entries)
	reply.Success = true

	// Advance the commit index to min(LeaderCommit, last new entry), but never
	// move it backwards: a delayed request may carry an older LeaderCommit.
	if newCommit := rf.min(args.LeaderCommit, args.Entries[n-1].Index); newCommit > rf.commitIndex {
		rf.commitIndex = newCommit
	}
	rf.applyCond.Broadcast()
}

①根据响应结果计算新的nextIndex 和matchIndex

②根据matchIndex计算leaderCommit

③leaderCommit 可能有变化所以提交一次条目

// appendSubsequentTreatmentL is the leader-side entry point for an
// AppendEntries reply from peer.
//
// A reply with a newer term makes this server call rf.newTerm and stop.
// Otherwise the reply is processed only if the request was sent in the term
// this server is still in; replies to requests from earlier terms are
// ignored.
func (rf *Raft) appendSubsequentTreatmentL(peer int, args RequestAppendArgs, reply ResponseAppendArgs) {
	switch {
	case reply.Term > rf.currentTerm:
		rf.newTerm(reply.Term)
	case rf.currentTerm == args.Term:
		// Still in the term the request was sent in.
		rf.appendEntriesResponse(peer, args, reply)
	}
}

// appendEntriesResponse updates the leader's bookkeeping for peer from the
// reply to one AppendEntries request.
//
// On failure only rf.nextIndex[peer] is adjusted, using the back-off hints in
// the reply:
//   - XTerm == -1: the follower's log was XLen entries short of
//     args.PrelogIndex, so the next send starts right after its last entry.
//   - otherwise: if the leader has entries of term XTerm, resume at the index
//     reported by rf.findlastTermIndex; if not, resume at XIndex.
//
// On success with a non-empty request, nextIndex and matchIndex are raised
// (never lowered) to cover the entries just sent, and the commit index is
// re-evaluated.
func (rf *Raft) appendEntriesResponse(peer int, args RequestAppendArgs, reply ResponseAppendArgs) {
	if !reply.Success {
		// Follower's log is too short.
		if reply.XTerm == -1 {
			// Derive the new value from the request rather than decrementing
			// the current nextIndex: several requests may be in flight, and
			// applying "-= XLen" once per reply would overshoot.
			rf.nextIndex[peer] = args.PrelogIndex + 1 - reply.XLen
			return
		}

		// Conflict: look for the follower's conflicting term (XTerm, not the
		// follower's current term) in the leader's own log.
		if i := rf.findlastTermIndex(args.PrelogIndex, reply.XTerm); i == -1 {
			// Leader has no entry of that term.
			rf.nextIndex[peer] = reply.XIndex
		} else {
			// Leader has the term but the logs diverge within it.
			rf.nextIndex[peer] = i
		}
		return
	}

	// A successful heartbeat carries no new entries; nothing to update.
	if len(args.Entries) == 0 {
		return
	}

	newMatch := args.PrelogIndex + len(args.Entries)
	newNext := newMatch + 1

	// Only move forward so that a late reply cannot undo newer progress.
	if newNext > rf.nextIndex[peer] {
		rf.nextIndex[peer] = newNext
	}
	if newMatch > rf.matchIndex[peer] {
		rf.matchIndex[peer] = newMatch
	}

	rf.inspectCommItIndex()
}

// inspectCommItIndex re-evaluates the leader's commit index.
//
// If there is an N > commitIndex such that a majority of matchIndex[i] >= N
// and log[N].term == currentTerm, commitIndex is set to N (Raft paper §5.3
// and §5.4). Each follower's matchIndex is tried as a candidate N. The apply
// loop is woken through rf.applyCond whether or not anything changed.
func (rf *Raft) inspectCommItIndex() {
	for peer, candidate := range rf.matchIndex {
		if peer == rf.me || candidate <= rf.commitIndex {
			continue
		}

		// Only an entry from the current term may be committed by counting
		// replicas; earlier-term entries become committed indirectly once a
		// current-term entry after them commits.
		if rf.log.getLogITerm(candidate) != rf.currentTerm {
			continue
		}

		// Count followers that hold the candidate index; peer itself is one.
		replicas := 1
		for other, match := range rf.matchIndex {
			if other == rf.me || other == peer {
				continue
			}
			if match >= candidate {
				replicas++
			}
		}

		// The leader is not counted above, so len/2 followers plus the leader
		// make a strict majority.
		if replicas >= len(rf.matchIndex)/2 {
			rf.commitIndex = candidate
			rf.matchIndex[rf.me] = candidate
		}
	}
	rf.applyCond.Broadcast()
}

条目提交

使用管道可能有死锁，这是mit6.824中提到的，要在管道处取消锁。

// kvServerLayer is the apply loop: it delivers committed log entries to
// rf.ApplyC one at a time, in index order, and never returns.
//
// An entry is delivered when the index after rf.lastApplied is at most
// rf.commitIndex, at most rf.log.lastlogIndex(), and greater than
// rf.log.start(). Otherwise the loop sleeps on rf.applyCond until it is
// signalled.
//
// rf.mu is held while Raft state is read and released around the channel
// send, so a slow receiver cannot block other goroutines that need the lock.
func (rf *Raft) kvServerLayer() {
	rf.mu.Lock()
	defer rf.mu.Unlock()

	for {
		next := rf.lastApplied + 1
		ready := rf.commitIndex >= next &&
			next <= rf.log.lastlogIndex() &&
			next > rf.log.start()
		if !ready {
			rf.applyCond.Wait()
			continue
		}

		rf.lastApplied = next
		// NOTE(review): Entrys is indexed directly by the log index —
		// confirm this still holds once the log has a non-zero start offset.
		msg := ApplyMsg{
			CommandValid: true,
			Command:      rf.log.Entrys[next].Value,
			CommandIndex: rf.log.Entrys[next].Index,
		}

		// Do not hold the lock across the channel send.
		rf.mu.Unlock()
		rf.ApplyC <- msg
		rf.mu.Lock()
	}
}

做的时候要多分析log，通过log可以看出很多问题，这些测试可以一个一个做。

posted on 2022-09-13 16:59 thotf 阅读(452) 评论(1) 收藏举报

刷新页面返回顶部

thotf

mit 6.824 lab2B，raft日志复制（lab2D中有关于此处大量代码修改找出了很多错误）

导航

公告