MapReduce Word Count Program (Self-Review)
1. The MyWordCount Class
Notes:
1. For local testing, uncomment both conf.set(...) lines below.
2. When submitting to the cluster from the local machine, the cross-platform ("heterogeneous platform") property must be set to true.
3. When running on the cluster itself, keep both lines commented out, then run the job on the cluster.
package com.littlepage.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

public class MyWordCount {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Load the configuration
        Configuration conf = new Configuration(true);
        // Run locally instead of on the cluster
        // conf.set("mapreduce.framework.name", "local");
        // Enable cross-platform (heterogeneous platform) submission
        // conf.set("mapreduce.app-submission.cross-platform", "true");
        // 2. Create the Job
        Job job = Job.getInstance(conf);
        // 3. Set the class whose jar is shipped with the job
        job.setJarByClass(MyWordCount.class);
        // 4. Set the job name
        job.setJobName("SteveYu's word count");
        // 5. Set the input path
        Path infile = new Path("/data/wc/input");
        TextInputFormat.addInputPath(job, infile);
        // 6. Set the output path, deleting it first if it already exists
        Path outfile = new Path("/data/wc/loveloveOutput");
        if (outfile.getFileSystem(conf).exists(outfile))
            outfile.getFileSystem(conf).delete(outfile, true);
        TextOutputFormat.setOutputPath(job, outfile);
        // 7. Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 8. Set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 9. Wait for the job to finish
        job.waitForCompletion(true);
    }
}
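After packaging, a job like this is typically submitted with the hadoop jar command (the jar name below is a placeholder):

    hadoop jar wc.jar com.littlepage.wc.MyWordCount

The results then appear under /data/wc/loveloveOutput, normally in a file named part-r-00000.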
2. The WordCountMapper Class
Purpose:
Defines the text-splitting step: the Mapper breaks each input line into (key, value) pairs, emitting (word, 1) for every word.
package com.littlepage.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;

public class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
    // Reuse the same writables instead of allocating one per record
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line on whitespace and emit (word, 1) for every token
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
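The tokenization can be tried outside Hadoop. This standalone sketch (the input line is invented for illustration) prints the (word, 1) pairs that the map() call above would write to the context:

    import java.util.StringTokenizer;

    public class TokenizeDemo {
        public static void main(String[] args) {
            String line = "hello world hello"; // hypothetical input line
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                // The mapper would call context.write(word, one) here
                System.out.println("(" + itr.nextToken() + ", 1)");
            }
            // Prints: (hello, 1) (world, 1) (hello, 1)
        }
    }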
3. The WordCountReducer Class
Purpose:
Performs the second stage of the computation: for each key, it sums the counts produced by the map stage.
package com.littlepage.wc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    // Records with the same key form one group, and reduce() is called once per group,
    // e.g. all the ("hello", 1) pairs arrive together
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
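Conceptually, the framework groups the mapper output by key during the shuffle before calling reduce(). This standalone sketch (the data is invented for illustration) mimics that grouping and summation with plain Java collections:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.TreeMap;

    public class ShuffleDemo {
        public static void main(String[] args) {
            // Hypothetical mapper output: each entry stands for a (word, 1) pair
            String[] emitted = {"hello", "world", "hello"};

            // The shuffle phase groups the values by key...
            Map<String, List<Integer>> groups = new TreeMap<>();
            for (String w : emitted) {
                groups.computeIfAbsent(w, k -> new ArrayList<>()).add(1);
            }

            // ...and reduce() sums each group, e.g. hello -> 2, world -> 1
            for (Map.Entry<String, List<Integer>> e : groups.entrySet()) {
                int sum = 0;
                for (int v : e.getValue()) sum += v;
                System.out.println(e.getKey() + "\t" + sum);
            }
        }
    }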
4. Problems That May Arise When Running on a Single Machine
1. The Hadoop distribution must be unpacked on the local machine.
2. HADOOP_HOME must be set and added to the environment variables.
3. core-site.xml must be placed in the resources folder, and that folder must be marked as a source folder in the IDE.
4. On Windows, the contents of Hadoop's bin directory must be replaced with the Windows builds (e.g. winutils.exe), and hadoop.dll must be copied into the System32 folder, since that is where Windows looks for system DLLs; see the sketch after this list for a quick environment check.
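As a sanity check before running locally, a throwaway snippet like this (the file names checked reflect the Windows setup described above) can confirm the environment points where you expect:

    import java.io.File;

    public class HadoopEnvCheck {
        public static void main(String[] args) {
            // HADOOP_HOME should point at the unpacked distribution
            String home = System.getenv("HADOOP_HOME");
            System.out.println("HADOOP_HOME = " + home);

            if (home != null) {
                // On Windows, bin should contain the Windows binaries
                System.out.println("winutils.exe present: " + new File(home, "bin/winutils.exe").exists());
                System.out.println("hadoop.dll present: " + new File(home, "bin/hadoop.dll").exists());
            }
        }
    }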