MapReduce 笔记
以最简单的统计词频为例,我们只需要简单地写两个函数,就可以搭建起一个简单的服务集群。
(1) Map和Reduce 函数
(2)MapReduceSpecification 对象(貌似有专门针对 C++ 的函数库)
下面的这个链接是对谷歌《MapReduce: Simplified Data Processing on Large Clusters》论文的翻译
【2】MapReduce超大集群的简单数据处理
Java环境下对MapReduce的设置
【3】http://blog.csdn.net/xiaotom5/article/details/8074791
下面是统计词频的源代码
1 #include "mapreduce/mapreduce.h" 2 3 // User's map function 4 class WordCounter : public Mapper { 5 public: 6 virtual void Map(const MapInput& input) { 7 const string& text = input.value(); 8 const int n = text.size(); 9 for (int i = 0; i < n; ) { 10 // Skip past leading whitespace 11 while ((i < n) && isspace(text)) 12 i++; 13 14 // Find word end 15 int start = i; 16 while ((i < n) && !isspace(text)) 17 i++; 18 if (start < i) 19 Emit(text.substr(start,i-start),"1"); 20 } 21 } 22 }; 23 24 REGISTER_MAPPER(WordCounter); 25 26 // User's reduce function 27 class Adder : public Reducer { 28 virtual void Reduce(ReduceInput* input) { 29 // Iterate over all entries with the 30 // same key and add the values 31 int64 value = 0; 32 while (!input->done()) { 33 value += StringToInt(input->value()); 34 input->NextValue(); 35 } 36 37 // Emit sum for input->key() 38 Emit(IntToString(value)); 39 } 40 }; 41 42 REGISTER_REDUCER(Adder); 43 44 int main(int argc, char** argv) { 45 ParseCommandLineFlags(argc, argv); 46 47 MapReduceSpecification spec; 48 49 // Store list of input files into "spec" 50 for (int i = 1; i < argc; i++) { 51 MapReduceInput* input = spec.add_input(); 52 input->set_format("text"); 53 input->set_filepattern(argv); 54 input->set_mapper_class("WordCounter"); 55 } 56 57 // Specify the output files: 58 // /gfs/test/freq-00000-of-00100 59 // /gfs/test/freq-00001-of-00100 60 // 61 MapReduceOutput* out = spec.output(); 62 out->set_filebase("/gfs/test/freq"); 63 out->set_num_tasks(100); 64 out->set_format("text"); 65 out->set_reducer_class("Adder"); 66 67 // Optional: do partial sums within map 68 // tasks to save network bandwidth 69 out->set_combiner_class("Adder"); 70 71 // Tuning parameters: use at most 2000 72 // machines and 100 MB of memory per task 73 spec.set_machines(2000); 74 spec.set_map_megabytes(100); 75 spec.set_reduce_megabytes(100); 76 77 // Now run it 78 MapReduceResult result; 79 if (!MapReduce(spec, &result)) abort(); 80 81 // Done: 'result' 
structure contains info 82 // about counters, time taken, number of 83 // machines used, etc. 84 return 0; 85 } 86