mapreduce程序编写(WordCount)

折腾了半天,终于成功编写了自己的第一个mapreduce程序,并通过打jar包的方式运行起来了。

运行环境:

windows 64bit 

eclipse 64bit

jdk6.0 64bit

一、工程准备

1、新建java project

2、导入jar包

新建一个user library 把hadoop文件夹里的hadoop-core和lib包里的所有包都导入进来,以免出错。

二、编码

1、主要是计算单词的小程序,测试用

package com.hirra;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
    //嵌套类 Mapper  
    //Mapper<keyin,valuein,keyout,valueout>  
    public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable>{  
        private final static IntWritable one = new IntWritable(1);  
        private Text word = new Text();  
          
        @Override  
        protected void map(Object key, Text value, Context context)  
                throws IOException, InterruptedException {  
            StringTokenizer itr = new StringTokenizer(value.toString());  
            while(itr.hasMoreTokens()){  
                word.set(itr.nextToken());  
                context.write(word, one);//Context机制  
            }  
        }  
    }  
      
      
    //嵌套类Reducer  
    //Reduce<keyin,valuein,keyout,valueout>  
    //Reducer的valuein类型要和Mapper的va lueout类型一致,Reducer的valuein是Mapper的valueout经过shuffle之后的值  
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{  
        private IntWritable result = new IntWritable();  
  
        @Override  
        protected void reduce(Text key, Iterable<IntWritable> values,  
                Context context)  
                throws IOException, InterruptedException {  
            int sum  = 0;  
            for(IntWritable i:values){  
                sum += i.get();  
            }  
            result.set(sum);  
            context.write(key,result);//Context机制  
        }  
  
          
          
    }  
      
    public static void main(String[] args) throws Exception{  
        Configuration conf = new Configuration();//获得Configuration配置 Configuration: core-default.xml, core-site.xml 
     //很关键
     conf.set("mapred.job.tracker", "hadoopmaster:9001");     String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();//获得输入参数[hdfs://localhost:9000/user/dat/input, hdfs://localhost:9000/user/dat/output] if(otherArgs.length != 2){//判断输入参数个数,不为两个异常退出 System.err.println("Usage:wordcount <in> <out>"); System.exit(2); } ////设置Job属性 Job job = new Job(conf,"word count"); job.setJarByClass(WordCount.class); job.setMapperClass(WordCountMapper.class); job.setCombinerClass(WordCountReducer.class);//将结果进行局部合并 job.setReducerClass(WordCountReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0]));//传入input path FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//传入output path,输出路径应该为空,否则报错org.apache.hadoop.mapred.FileAlreadyExistsException。 System.exit(job.waitForCompletion(true)?0:1);//是否正常退出 } }

2、注意问题

有些jar包没导入会出现问题

三、生成jar包

1、eclipse自带功能export jar包

四、运行

1、ssh client工具导入至linux

2、hadoop运行,转到hadoop的bin目录下,执行下面指令(如果导出jar包时没有在manifest中指定Main-Class,则需要在jar包名之后加上主类全名com.hirra.WordCount):

./hadoop jar test.jar /README.txt /usr/dat/output  

3、注意问题

output目录必须是之前不存在的路径。

 

posted @ 2015-09-13 22:13  人生设计师  阅读(627)  评论(0)  编辑  收藏  举报