MIT6.824 实验一 MapReduce
前言
先贴个图,所有case全过了!做了好几天全pass了还是挺有成就感的
MIT6.824配套实验中要求:
多个输入文件,每个文件在map阶段需要拆分成nReduce个输出文件,这里会产生nReduce * len(files)个中间文件,然后nReduce个函数中,将nReduce * len(files)个中间文件合并到nReduce个最终文件中(这里比较巧妙的点在于对于每一个单词,其进行hash对nReduce取模得到的值是一样的,所以我们可以通过标记hash值将多个不同文件中相同的key读取写入到最终文件中)
具体过程如下图:
- 假设我们有8个输入文件,对应8个Map task,nReduce为3
- 对于8个输入文件,在Map阶段每个文件(任务)生成3个中间文件,总共24个文件
- 在3个reduce task中,每个task分别会读取格式为mr-*-0、mr-*-1、mr-*-2的文件,统计每个单词的结果,分别输出到mr-out-0、mr-out-1、mr-out-2三个输出文件中
这里可能大家会有一个疑问,怎么能够保证8个不同文件的相同单词会被统计到同一个输出文件中呢?
这里只需要我们在对每个map task产生reduce task需要的中间文件时,对每个单词进行hash%nReduce,就可以将相同的词都写入到mr-*-X(0...1...2)文件中
数据结构设计
实验中有提到和论文中讲的略有不同,在实验中我们需要worker向master发送获取task请求,当worker处理task成功后需要告诉master:我处理task:ID 成功了
请求task
// GetTaskArgs is the (empty) RPC request a worker sends to ask for a task.
type GetTaskArgs struct{}

// GetTaskReply carries one task assignment from the master back to a worker.
type GetTaskReply struct {
TaskNumber int // index of the assigned task within its phase
NReduce int // total reduce tasks; map workers use it to partition output
NMap int // total map tasks; reduce workers use it to locate intermediate files
FileName string // input file for a map task; empty for reduce tasks
Phrase Phrase // phase this task belongs to: map, reduce or done
}
上报task的状态
// ReportArgs is the RPC a worker sends after attempting a task.
type ReportArgs struct {
Succeed bool // whether the task completed successfully
Phrase Phrase // phase of the reported task (selects map vs reduce list)
Index int // task index within that phase's list
}

// ReportReply is empty; Report only updates master state.
type ReportReply struct{}
task定义如下:
// Phrase identifies which stage of the job a task (or the master) is in.
// NOTE(review): "Phrase" is presumably a typo for "Phase" — kept as-is for compatibility.
type Phrase string

// Status tracks a single task's lifecycle: ready -> running -> done.
type Status string
const (
PhraseMap Phrase = "map" // map stage
PhraseReduce Phrase = "reduce" // reduce stage
PhraseDone Phrase = "done" // whole job finished
StatusReady Status = "ready" // task waiting to be handed to a worker
StatusRunning Status = "running" // task currently assigned to a worker
StatusDone Status = "done" // task reported complete
timeout = time.Second * 10 // watchdog: a running task is requeued after this long
)
// Task is one unit of work: a single input file to map, or a single reduce
// bucket to aggregate.
type Task struct {
FileName string // input file; only meaningful for map tasks
TaskNumber int // index within the phase's task list
Phrase Phrase // map or reduce
Status Status // ready / running / done
}
Master中的数据结构定义如下:
type Master struct {
// Your definitions here.
Phrase Phrase
MapTasks []*Task
ReduceTasks []*Task
Lock *sync.Mutex
}
实现逻辑
Master初始化#
// MakeMaster builds a Master for the given input files and nReduce reduce
// buckets, seeds one ready map task per file plus one ready reduce task per
// bucket, starts the RPC server, and returns the master.
func MakeMaster(files []string, nReduce int) *Master {
	// Your code here.
	master := &Master{
		Phrase:      PhraseMap,
		MapTasks:    make([]*Task, 0, len(files)),
		ReduceTasks: make([]*Task, 0, nReduce),
		Lock:        &sync.Mutex{},
	}
	for idx, name := range files {
		master.MapTasks = append(master.MapTasks, &Task{
			TaskNumber: idx,
			FileName:   name,
			Phrase:     PhraseMap,
			Status:     StatusReady,
		})
	}
	for slot := 0; slot < nReduce; slot++ {
		master.ReduceTasks = append(master.ReduceTasks, &Task{
			TaskNumber: slot,
			Phrase:     PhraseReduce,
			Status:     StatusReady,
		})
	}
	master.server()
	return master
}
Master提供GetTask功能#
// GetTask hands one ready task to the calling worker, or returns an error
// when nothing is available right now (all tasks running, or job done);
// the worker is expected to retry later.
func (m *Master) GetTask(args *GetTaskArgs, reply *GetTaskReply) error {
	m.Lock.Lock()
	defer m.Lock.Unlock()
	var picked *Task
	switch m.Phrase {
	case PhraseMap:
		picked = m.getTaskByPhrase(PhraseMap)
	case PhraseReduce:
		picked = m.getTaskByPhrase(PhraseReduce)
	}
	if picked == nil {
		return errors.New("get nil tasks")
	}
	reply.TaskNumber = picked.TaskNumber
	reply.NMap = len(m.MapTasks)
	reply.NReduce = len(m.ReduceTasks)
	reply.FileName = picked.FileName
	reply.Phrase = picked.Phrase
	return nil
}
// getTaskByPhrase scans the task list for the given phase and hands out the
// first task that is still ready, marking it running. A watchdog timer
// requeues the task (back to ready) if the worker has not reported success
// within `timeout`, so a crashed or slow worker cannot stall the job.
// Caller must hold m.Lock.
func (m *Master) getTaskByPhrase(phrase Phrase) *Task {
	var tasks []*Task
	switch phrase {
	case PhraseMap:
		tasks = m.MapTasks
	case PhraseReduce:
		tasks = m.ReduceTasks
	}
	for _, task := range tasks {
		if task.Status != StatusReady {
			continue
		}
		task.Status = StatusRunning
		// The original built context.WithTimeout and discarded the cancel
		// func (a `go vet` lostcancel violation, and the timer was held for
		// the full 10s even after the task finished). time.AfterFunc does
		// the same job with no context to leak.
		watched := task // shadow the loop variable for the closure
		time.AfterFunc(timeout, func() {
			m.Lock.Lock()
			defer m.Lock.Unlock()
			if watched.Status != StatusDone {
				watched.Status = StatusReady
			}
		})
		return task
	}
	return nil
}
Master 会通过获取Worker的执行结果来更新状态#
// Report records a worker's result for the task identified by (Phrase, Index).
// On success the task is marked done and, once every task of the current
// phase is done, the master advances map -> reduce -> done. On failure the
// task is requeued immediately instead of waiting for the watchdog timeout.
func (m *Master) Report(args *ReportArgs, reply *ReportReply) error {
	m.Lock.Lock()
	defer m.Lock.Unlock()
	var tasks []*Task
	switch args.Phrase {
	case PhraseMap:
		tasks = m.MapTasks
	case PhraseReduce:
		tasks = m.ReduceTasks
	default:
		return nil // nothing to record for the done phase
	}
	if args.Index < 0 || args.Index >= len(tasks) {
		return errors.New("report: task index out of range")
	}
	// The original ignored args.Succeed and marked the task done even when
	// the worker reported a failure, silently losing that task's work.
	if !args.Succeed {
		if tasks[args.Index].Status != StatusDone {
			tasks[args.Index].Status = StatusReady
		}
		return nil
	}
	tasks[args.Index].Status = StatusDone
	// Advance phases strictly in order. Checking reduce completion only
	// while in the reduce phase guards the nReduce == 0 edge case, where an
	// empty reduce list would otherwise flip the master to done during map.
	if m.Phrase == PhraseMap && allTasksDone(m.MapTasks) {
		m.Phrase = PhraseReduce
	}
	if m.Phrase == PhraseReduce && allTasksDone(m.ReduceTasks) {
		m.Phrase = PhraseDone
	}
	return nil
}
// allTasksDone reports whether every task in the slice has reached
// StatusDone. An empty slice counts as done.
func allTasksDone(tasks []*Task) bool {
	for i := range tasks {
		if tasks[i].Status == StatusDone {
			continue
		}
		return false
	}
	return true
}
Master判断任务执行成功#
// Done reports whether the whole map-reduce job has finished; mr-main polls
// this periodically to decide when to exit.
func (m *Master) Done() bool {
	m.Lock.Lock()
	defer m.Lock.Unlock()
	// Your code here.
	finished := m.Phrase == PhraseDone
	return finished
}
Worker执行流程#
// Worker pulls tasks from the master in a loop, executes them with the
// supplied map/reduce functions, and reports each successful completion.
// It returns once the master announces the done phase.
func Worker(mapf func(string, string) []KeyValue,
	reducef func(string, []string) string) {
	// Your worker implementation here.
	// uncomment to send the Example RPC to the master.
	// CallExample()
	for {
		reply := new(GetTaskReply)
		if ok := call("Master.GetTask", &GetTaskArgs{}, reply); !ok {
			// No task available (all running, or master momentarily busy);
			// back off briefly and retry. The original slept a full 10s,
			// which made the worker needlessly sluggish.
			time.Sleep(time.Second)
			continue
		}
		var err error
		switch reply.Phrase {
		case PhraseMap:
			err = doMap(mapf, reply.FileName, reply.TaskNumber, reply.NReduce)
		case PhraseReduce:
			err = doReduce(reducef, reply.FileName, reply.TaskNumber, reply.NMap)
		case PhraseDone:
			// The original looped forever here; once the job is done the
			// worker process should exit.
			log.Println("map-reduce task done")
			return
		}
		if err != nil {
			// Do not report failures: the master's watchdog requeues the
			// task after its timeout expires (matches original behavior).
			continue
		}
		if ok := call("Master.Report", &ReportArgs{
			Succeed: true,
			Phrase:  reply.Phrase,
			Index:   reply.TaskNumber,
		}, &ReportReply{}); !ok {
			log.Println("failed call report")
		}
	}
}
Map task处理#
// doMap runs one map task: it reads fileName, applies mapf, and partitions
// the resulting key/value pairs into nReduce intermediate files named
// mr-<mapIndex>-<r>.txt, where r = ihash(key) % nReduce. Each file is
// written to a temp file and atomically renamed, so a crashed worker never
// leaves a partially-written intermediate file behind.
func doMap(mapf func(string, string) []KeyValue, fileName string, mapIndex, nReduce int) error {
	// ReadFile replaces the original Open+ReadAll pair, whose
	// `defer file.Close()` ran before the error check.
	content, err := ioutil.ReadFile(fileName)
	if err != nil {
		log.Printf("can not read %s", fileName)
		return err
	}
	kva := mapf(fileName, string(content))
	// One output file per reduce bucket.
	for i := 0; i < nReduce; i++ {
		tmp, err := ioutil.TempFile("", "*.txt")
		if err != nil {
			// Return instead of log.Fatalf: killing the process would stop
			// the master from retrying this task on another worker.
			log.Printf("failed create file err: %s", err.Error())
			return err
		}
		enc := json.NewEncoder(tmp)
		var writeErr error
		for _, kv := range kva {
			if ihash(kv.Key)%nReduce != i {
				continue
			}
			// The original discarded Encode's error.
			if err := enc.Encode(&kv); err != nil {
				writeErr = err
				break
			}
		}
		// Close explicitly: a defer inside this loop would keep all nReduce
		// temp files open until doMap returns.
		if closeErr := tmp.Close(); writeErr == nil {
			writeErr = closeErr
		}
		if writeErr != nil {
			os.Remove(tmp.Name())
			return writeErr
		}
		interMediateFileName := fmt.Sprintf("mr-%s-%s", strconv.Itoa(mapIndex), strconv.Itoa(i)+".txt")
		if err := os.Rename(tmp.Name(), interMediateFileName); err != nil {
			log.Printf("rename failed: %s", err)
			return err
		}
	}
	return nil
}
Reduce task处理#
// doReduce runs one reduce task: it reads the nMap intermediate files
// mr-<i>-<reduceTaskIndex>.txt, groups values by key, applies reducef to
// each key in sorted order, and writes "key value" lines to
// mr-out-<reduceTaskIndex>.txt via a temp file + rename so readers never
// observe a partially-written output file.
func doReduce(reducef func(string, []string) string, intermediateFileName string, reduceTaskIndex, nMap int) error {
	grouped := make(map[string][]string)
	for i := 0; i < nMap; i++ {
		intermediateFileName = fmt.Sprintf("mr-%s-%s", strconv.Itoa(i), strconv.Itoa(reduceTaskIndex)+".txt")
		file, err := os.Open(intermediateFileName)
		if err != nil {
			// Return instead of log.Fatalf: killing the process would stop
			// the master from reassigning this task to another worker.
			log.Printf("cannot open %v", intermediateFileName)
			return err
		}
		dec := json.NewDecoder(file)
		for {
			var kv KeyValue
			// Decode returns io.EOF at end of file; any error ends the scan
			// (matching the original's behavior of breaking on any error).
			if err := dec.Decode(&kv); err != nil {
				break
			}
			grouped[kv.Key] = append(grouped[kv.Key], kv.Value)
		}
		// Close inside the loop; a defer here would pile up nMap open files
		// until the function returns.
		file.Close()
	}
	// Sort keys so the output file is deterministic.
	keys := make([]string, 0, len(grouped))
	for k := range grouped {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	outputFileName := "mr-out-" + strconv.Itoa(reduceTaskIndex) + ".txt"
	outputFile, err := ioutil.TempFile("", "*.txt")
	if err != nil {
		log.Printf("failed create outputFile: %s", err.Error())
		return err
	}
	for _, k := range keys {
		// One "key value" line per key in mr-out-X.
		fmt.Fprintf(outputFile, "%v %v\n", k, reducef(k, grouped[k]))
	}
	// Close before renaming (the original renamed first and ignored the
	// rename error entirely).
	if err := outputFile.Close(); err != nil {
		return err
	}
	if err := os.Rename(outputFile.Name(), outputFileName); err != nil {
		return err
	}
	return nil
}
作者:Esofar
出处:https://www.cnblogs.com/wanber/p/16267383.html
版权:本作品采用「署名-非商业性使用-相同方式共享 4.0 国际」许可协议进行许可。
Buy me a cup of coffee ☕.
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 单线程的Redis速度为什么快?
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 展开说说关于C#中ORM框架的用法!