Chaining MapReduce Jobs in Hadoop: Linking Pre- and Post-Processing Stages
package com.test;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.ChainMapper;
import org.apache.hadoop.mapred.lib.ChainReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Chaining a MapReduce job: linking pre- and post-processing stages.
 *
 * @author Administrator
 */
public class MyJobLink extends Configured implements Tool {

    public static class Reduce extends MapReduceBase
            implements Reducer<LongWritable, Text, Text, Text> {
        public void reduce(LongWritable key, Iterator<Text> values,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            // nothing
            output.collect(new Text("1"), new Text("1"));
        }
    }

    public static class Map1 extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {
        public void map(LongWritable key, Text value,
                OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            // nothing
            output.collect(value, new Text(key.toString()));
        }
    }

    public static class Map2 extends MapReduceBase
            implements Mapper<Text, Text, LongWritable, Text> {
        public void map(Text key, Text value,
                OutputCollector<LongWritable, Text> output, Reporter reporter)
                throws IOException {
            // nothing
            output.collect(new LongWritable(Long.valueOf(value.toString())), key);
        }
    }

    public static class Map3 extends MapReduceBase
            implements Mapper<Text, Text, LongWritable, Text> {
        public void map(Text key, Text value,
                OutputCollector<LongWritable, Text> output, Reporter reporter)
                throws IOException {
            // nothing
            output.collect(new LongWritable(Long.valueOf("1")), key);
        }
    }

    public static class Map4 extends MapReduceBase
            implements Mapper<LongWritable, Text, LongWritable, Text> {
        public void map(LongWritable key, Text value,
                OutputCollector<LongWritable, Text> output, Reporter reporter)
                throws IOException {
            // nothing
            output.collect(new LongWritable(Long.valueOf("1")), new Text("1"));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        JobConf job = new JobConf(conf);
        job.setJobName("ChainJob");
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);

        /*
         * Add the Map1 stage to the job.
         * ChainMapper.addMapper() adds every step that runs before the reduce:
         *
         * ChainMapper.addMapper(JobConf job,
         *     Class<? extends Mapper<LongWritable, Text, Text, Text>> klass,
         *     Class<? extends LongWritable> inputKeyClass,
         *     Class<? extends Text> inputValueClass,
         *     Class<? extends Text> outputKeyClass,
         *     Class<? extends Text> outputValueClass,
         *     boolean byValue,
         *     JobConf mapperConf)
         *
         * The method takes eight arguments. The first and last are the global and
         * local JobConf objects, respectively. The second (klass) is the Mapper
         * class that does the actual processing.
         * The remaining four -- inputKeyClass, inputValueClass, outputKeyClass and
         * outputValueClass -- are the input/output types of that Mapper.
         *
         * A word about byValue. In the standard Mapper model, output key/value
         * pairs are serialized and written to disk (keys and values implement
         * Writable so they can be copied and serialized), waiting to be shuffled
         * to what may be a completely different node. Formally this is
         * pass-by-value: a copy of the key/value pair is sent.
         * Here, however, one Mapper is chained to the next and they execute in the
         * same JVM thread, so key/value pairs can be passed by reference: the
         * output of the first Mapper stays in memory and the following Mapper
         * simply refers to the same memory location. When Map1 calls
         * OutputCollector.collect(K k, V v), the objects k and v are handed
         * directly to Map2's map() method. With potentially large amounts of data
         * flowing between mappers, avoiding those copies improves performance.
         * But this violates a subtler "contract" of the Hadoop MapReduce API: a
         * call to OutputCollector.collect(K k, V v) must never change the contents
         * of k and v. After Map1 calls collect(), it may keep using k and v,
         * trusting completely that their values stay unchanged. If we pass those
         * objects to Map2 by reference and Map2 then modifies them, the contract
         * is broken.
         * If you are sure that Map1's map() no longer uses k and v after calling
         * OutputCollector.collect(K k, V v), or that Map2 does not modify the k
         * and v it receives, you can set byValue to false for a performance gain.
         * If you are not familiar with the mappers' internals, play it safe: set
         * byValue to true, keep pass-by-value semantics, and the mappers will
         * behave as expected.
         */
        JobConf map1Conf = new JobConf(false);
        ChainMapper.addMapper(job, Map1.class, LongWritable.class, Text.class,
                Text.class, Text.class, true, map1Conf);

        /*
         * Add the Map2 stage to the job.
         * ChainMapper.addMapper() adds every step that runs before the reduce.
         */
        JobConf map2Conf = new JobConf(false);
        ChainMapper.addMapper(job, Map2.class, Text.class, Text.class,
                LongWritable.class, Text.class, true, map2Conf);

        /*
         * Add the Reduce stage to the job.
         * The static ChainReducer.setReducer() method sets the reducer.
         */
        JobConf reduceConf = new JobConf(false);
        ChainReducer.setReducer(job, Reduce.class, LongWritable.class, Text.class,
                Text.class, Text.class, true, reduceConf);

        /*
         * Add the Map3 stage to the job.
         * ChainReducer.addMapper() adds the steps that follow the reducer.
         */
        JobConf map3Conf = new JobConf(false);
        ChainReducer.addMapper(job, Map3.class, Text.class, Text.class,
                LongWritable.class, Text.class, true, map3Conf);

        /*
         * Add the Map4 stage to the job.
         * ChainReducer.addMapper() adds the steps that follow the reducer.
         */
        JobConf map4Conf = new JobConf(false);
        ChainReducer.addMapper(job, Map4.class, LongWritable.class, Text.class,
                LongWritable.class, Text.class, true, map4Conf);

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        final String inputPath = "/home/dev/hadooptest/mapin/cite";
        final String outputPath = "/home/dev/hadooptest/mapin/cite/out";
        String[] paths = { inputPath, outputPath };
        /*
         * The driver's main() -> ToolRunner.run() -> run() from the Tool
         * interface -> the driver's overriding method parses the arguments and
         * launches the job (merged into one method by overriding Tool's run()).
         */
        int res = ToolRunner.run(new Configuration(), new MyJobLink(), paths);
        System.exit(res);
    }
}
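To make the byValue discussion in the comments above concrete, here is a minimal sketch (not part of the job above; ReusingMapper is a hypothetical class using the same old mapred API and imports as the listing) of the kind of mapper that is only safe to chain with byValue = true, because it keeps mutating its output object after calling collect():

public static class ReusingMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {
    private final Text outValue = new Text();   // one instance reused for every record

    public void map(LongWritable key, Text value,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        outValue.set(value.toString().toUpperCase());
        output.collect(value, outValue);
        // Mutation after collect(): with byValue = true the chain passed a copy
        // downstream, so this is harmless. With byValue = false the next mapper
        // in the chain holds a reference to outValue and would silently see its
        // contents change, breaking the collect() "contract" described above.
        outValue.set("");
    }
}

Likewise, when running against a real cluster the hard-coded paths in main() would normally be replaced by command-line arguments, e.g. ToolRunner.run(new Configuration(), new MyJobLink(), args), so the same driver can be reused with different input and output directories.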
posted on 2012-06-20 15:24 by 要么牛逼,要么滚蛋