MapReduce Development -- Composite MR Jobs
Iterative MapReduce Jobs
Run the job in a loop inside the driver's main method and terminate the loop once a given condition is met; typically each round's output becomes the next round's input.
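A minimal sketch of such a driver loop, assuming a fixed iteration cap as the stop condition (the Iterative class name, the per-iteration output directories, and maxIterations are illustrative placeholders, not part of the original example):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Iterative {

    public static void main(String[] args) throws Exception {
        Path input = new Path(args[0]);
        Path outputBase = new Path(args[1]);
        int maxIterations = 10; // termination condition: a fixed iteration cap

        for (int i = 0; i < maxIterations; i++) {
            Configuration conf = new Configuration();
            Job job = new Job(conf, "iteration-" + i);
            job.setJarByClass(Iterative.class);
            // each round reads the previous round's output
            FileInputFormat.addInputPath(job, input);
            Path output = new Path(outputBase, "iter" + i);
            FileOutputFormat.setOutputPath(job, output);
            if (!job.waitForCompletion(true)) {
                break; // stop if a round fails
            }
            // a convergence check (e.g. reading a job counter) could also terminate the loop here
            input = output; // this round's output is the next round's input
        }
    }

}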
Sequentially Composed MapReduce Jobs
Several MapReduce jobs run one after another, with each job's output serving as the next job's input; calling job.waitForCompletion(true) ensures that each job finishes before the next one starts.
package action.hadoop.chapter5.section6.demo562;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Shunxu {

    public static void main(String[] args) throws Exception {
        Path job1inpath = new Path(args[0]);
        Path job1outpath = new Path(args[1]);
        Path job2outpath = new Path(args[2]);
        Path job3outpath = new Path(args[3]);

        // job1: reads the original input
        Configuration conf1 = new Configuration();
        Job job1 = new Job(conf1, "job1");
        job1.setJarByClass(Shunxu.class);
        FileInputFormat.addInputPath(job1, job1inpath);
        FileOutputFormat.setOutputPath(job1, job1outpath);
        job1.waitForCompletion(true); // block until job1 finishes

        // job2: reads job1's output
        Configuration conf2 = new Configuration();
        Job job2 = new Job(conf2, "job2");
        job2.setJarByClass(Shunxu.class);
        FileInputFormat.addInputPath(job2, job1outpath); // job1's output is job2's input
        FileOutputFormat.setOutputPath(job2, job2outpath);
        job2.waitForCompletion(true);

        // job3: reads job2's output
        Configuration conf3 = new Configuration();
        Job job3 = new Job(conf3, "job3");
        job3.setJarByClass(Shunxu.class);
        FileInputFormat.addInputPath(job3, job2outpath); // job2's output is job3's input
        FileOutputFormat.setOutputPath(job3, job3outpath);
        job3.waitForCompletion(true);
    }

}
Dependency-Based Composite MapReduce Jobs
New API: one MR job's input depends on the output of other MR jobs. See the test classes TestMapReduceJobControlWithMocks and TestControlledJob in the Hadoop source code.
package action.hadoop.chapter5.section6.demo563;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Depending {
    public static void main(String[] args) throws Exception {
        Path job1inpath = new Path(args[0]);
        Path job1outpath = new Path(args[1]);
        Path job2inpath = new Path(args[2]);
        Path job2outpath = new Path(args[3]);
        Path job3outpath = new Path(args[4]);

        // jobx: independent job
        Configuration confx = new Configuration();
        Job job1 = new Job(confx);
        job1.setJarByClass(Depending.class);
        FileInputFormat.addInputPath(job1, job1inpath);
        FileOutputFormat.setOutputPath(job1, job1outpath);
        ControlledJob jobx = new ControlledJob(job1, null);
        jobx.setJobName("jobx");

        // joby: independent job
        Configuration confy = new Configuration();
        Job job2 = new Job(confy);
        job2.setJarByClass(Depending.class);
        FileInputFormat.addInputPath(job2, job2inpath);
        FileOutputFormat.setOutputPath(job2, job2outpath);
        ControlledJob joby = new ControlledJob(job2, null);
        joby.setJobName("joby");

        // jobz: consumes the output of jobx and joby
        Configuration confz = new Configuration();
        Job job3 = new Job(confz);
        job3.setJarByClass(Depending.class);
        FileInputFormat.addInputPath(job3, job1outpath); // jobx's output
        FileInputFormat.addInputPath(job3, job2outpath); // joby's output
        FileOutputFormat.setOutputPath(job3, job3outpath);
        ControlledJob jobz = new ControlledJob(job3, null);
        jobz.setJobName("jobz");

        // declare the dependencies: jobz runs only after jobx and joby succeed
        jobz.addDependingJob(jobx);
        jobz.addDependingJob(joby);

        JobControl jobControl = new JobControl("jobzdependonjobxy");
        jobControl.addJob(jobx);
        jobControl.addJob(joby);
        jobControl.addJob(jobz);
        // JobControl is a Runnable; run it in its own thread and poll until all jobs finish
        Thread controller = new Thread(jobControl);
        controller.start();
        while (!jobControl.allFinished()) {
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
            }
        }
        System.out.println("jobx.getJobState():" + jobx.getJobState());
        System.out.println("joby.getJobState():" + joby.getJobState());
        System.out.println("jobz.getJobState():" + jobz.getJobState());
        jobControl.stop();
    }

}
Old API: one MR job's input depends on the output of other MR jobs.
package action.hadoop.chapter5.section6.demo563;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;

public class Depending_OldAPI {
    public static void main(String[] args) throws Exception {
        Path job1inpath = new Path(args[0]);
        Path job1outpath = new Path(args[1]);
        Path job2inpath = new Path(args[2]);
        Path job2outpath = new Path(args[3]);
        Path job3outpath = new Path(args[4]);

        Configuration conf = new Configuration();

        // jobx: independent job
        JobConf confx = new JobConf(conf, Depending_OldAPI.class);
        FileInputFormat.addInputPath(confx, job1inpath);
        FileOutputFormat.setOutputPath(confx, job1outpath);
        Job jobx = new Job(confx);
        jobx.setJobName("jobx");

        // joby: independent job
        JobConf confy = new JobConf(conf, Depending_OldAPI.class);
        FileInputFormat.addInputPath(confy, job2inpath);
        FileOutputFormat.setOutputPath(confy, job2outpath);
        Job joby = new Job(confy);
        joby.setJobName("joby");

        // jobz: consumes the output of jobx and joby
        JobConf confz = new JobConf(conf, Depending_OldAPI.class);
        FileInputFormat.addInputPath(confz, job1outpath);
        FileInputFormat.addInputPath(confz, job2outpath);
        FileOutputFormat.setOutputPath(confz, job3outpath);
        Job jobz = new Job(confz);
        jobz.setJobName("jobz");

        // declare the dependencies: jobz runs only after jobx and joby succeed
        jobz.addDependingJob(jobx);
        jobz.addDependingJob(joby);

        JobControl jobControl = new JobControl("jobzdependonjobxy");
        jobControl.addJob(jobx);
        jobControl.addJob(joby);
        jobControl.addJob(jobz);
        Thread theController = new Thread(jobControl);
        theController.start();
        while (!jobControl.allFinished()) {
            try {
                Thread.sleep(500);
            } catch (Exception e) {
            }
        }
        System.out.println("Failed jobs: " + jobControl.getFailedJobs().size());
        jobControl.stop();
    }

}
Chained MapReduce Jobs for Pre- and Post-Processing Around the Core Map and Reduce Phases
package action.hadoop.chapter5.section6.demo564;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;
import org.apache.hadoop.mapred.lib.ChainMapper;
import org.apache.hadoop.mapred.lib.ChainReducer;

public class Chain {

    public static void main(String[] args) throws Exception {
        Path jobinpath = new Path(args[0]);
        Path joboutpath = new Path(args[1]);

        JobConf job = new JobConf(Chain.class);
        job.setJobName("Chain");
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, jobinpath);
        FileOutputFormat.setOutputPath(job, joboutpath);

        // per-stage configurations for the chained mappers and the reducer
        JobConf mapper1 = new JobConf();
        JobConf mapper2 = new JobConf();
        JobConf mapper3 = new JobConf();
        JobConf mapper4 = new JobConf();
        JobConf reducerConf = new JobConf();

        // map chain: Mapper1 (strip special characters) -> Mapper2 (tokenize and count)
        ChainMapper.addMapper(job, Mapper1.class, LongWritable.class, Text.class,
                LongWritable.class, Text.class, true, mapper1);
        ChainMapper.addMapper(job, Mapper2.class, LongWritable.class, Text.class,
                Text.class, IntWritable.class, true, mapper2);
        // single reducer, followed by two post-processing mappers
        ChainReducer.setReducer(job, MyReducer.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, true, reducerConf);
        ChainReducer.addMapper(job, Mapper3.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, true, mapper3);
        ChainReducer.addMapper(job, Mapper4.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, true, mapper4);

        Job job1 = new Job(job);
        JobControl jobControl = new JobControl("test");
        jobControl.addJob(job1);
        Thread controller = new Thread(jobControl);
        controller.start();
        while (!jobControl.allFinished()) {
            try {
                Thread.sleep(300);
            } catch (InterruptedException e) {
            }
        }
        jobControl.stop();
    }

    /**
     * Pre-processing mapper: strips special characters from each line.
     * @author connor
     */
    public static class Mapper1 extends MapReduceBase implements
            Mapper<LongWritable, Text, LongWritable, Text> {
        Text line = new Text();

        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<LongWritable, Text> output, Reporter reporter)
                throws IOException {
            line.set(cutSpecialChar(value.toString()));
            output.collect(key, line);
        }

        public String cutSpecialChar(String str) {
            // use replace() so that "(" is treated literally rather than as a regex
            return str.replace(",", " ").replace("(", " ").replace("-", " ").replace("“", " ");
        }
    }

    /**
     * Tokenizing mapper: emits (word, 1) for each token.
     * @author connor
     */
    public static class Mapper2 extends MapReduceBase implements
            Mapper<LongWritable, Text, Text, IntWritable> {
        Text word = new Text();
        IntWritable one = new IntWritable(1);

        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken().toLowerCase());
                output.collect(word, one);
            }
        }
    }

    /**
     * Reducer: sums the counts for each word.
     */
    public static class MyReducer extends MapReduceBase implements
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            result.set(sum);
            output.collect(key, result);
        }
    }

    /**
     * Post-processing mapper: wraps each word in square brackets.
     */
    public static class Mapper3 extends MapReduceBase implements
            Mapper<Text, IntWritable, Text, IntWritable> {

        @Override
        public void map(Text key, IntWritable value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            key.set("[" + key.toString() + "]");
            output.collect(key, value);
        }
    }

    /**
     * Post-processing mapper: right-pads each word to 20 characters for aligned output.
     */
    public static class Mapper4 extends MapReduceBase implements
            Mapper<Text, IntWritable, Text, IntWritable> {

        @Override
        public void map(Text key, IntWritable value,
                OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            String word = key.toString();
            if (word.length() < 20) {
                for (int i = 20 - word.length(); i > 0; i--) {
                    word += " ";
                }
            }
            key.set(word);
            output.collect(key, value);
        }
    }

}
posted on 2014-03-13 12:58 by 康纳(connor)