一个可以跑的Hadoop的WordCount程序

搭个新环境时总要折腾一下，于是干脆记下来。

程序：

package  com.my;  
  
import  java.io.IOException;  
import  java.util.Iterator;  
import  java.util.StringTokenizer;  
  
import  org.apache.hadoop.fs.Path;  
import  org.apache.hadoop.io.IntWritable;  
import  org.apache.hadoop.io.LongWritable;  
import  org.apache.hadoop.io.Text;  
import  org.apache.hadoop.mapred.FileInputFormat;  
import  org.apache.hadoop.mapred.FileOutputFormat;  
import  org.apache.hadoop.mapred.JobClient;  
import  org.apache.hadoop.mapred.JobConf;  
import  org.apache.hadoop.mapred.MapReduceBase;  
import  org.apache.hadoop.mapred.Mapper;  
import  org.apache.hadoop.mapred.OutputCollector;  
import  org.apache.hadoop.mapred.Reducer;  
import  org.apache.hadoop.mapred.Reporter;  
import  org.apache.hadoop.mapred.TextInputFormat;  
import  org.apache.hadoop.mapred.TextOutputFormat;  
public   class  WordCount  
{  
  
    public   static   class  Map  extends  MapReduceBase  implements   
            Mapper<LongWritable, Text, Text, IntWritable>  
    {  
        private   final   static  IntWritable one =  new  IntWritable( 1 );  
        private  Text word =  new  Text();  
          
        public   void  map(LongWritable key, Text value,  
                OutputCollector<Text, IntWritable> output, Reporter reporter)  
                throws  IOException  
        {  
            String line = value.toString();  
            StringTokenizer tokenizer = new  StringTokenizer(line);  
            while  (tokenizer.hasMoreTokens())  
            {  
                word.set(tokenizer.nextToken());  
                output.collect(word, one);  
            }  
        }  
    }  
  
    public   static   class  Reduce  extends  MapReduceBase  implements   
            Reducer<Text, IntWritable, Text, IntWritable>  
    {  
        public   void  reduce(Text key, Iterator<IntWritable> values,  
                OutputCollector<Text, IntWritable> output, Reporter reporter)  
                throws  IOException  
        {  
            int  sum =  0 ;  
            while  (values.hasNext())  
            {  
                sum += values.next().get();  
            }  
            output.collect(key, new  IntWritable(sum));  
        }  
    }  
  
    public   static   void  main(String[] args)  throws  Exception  
    {  
        JobConf conf = new  JobConf(WordCount. class );  
        conf.setJobName("wordcount" ); 
  
        conf.setOutputKeyClass(Text.class );
        conf.setOutputValueClass(IntWritable.class );
  
        conf.setMapperClass(Map.class );
        conf.setCombinerClass(Reduce.class );
        conf.setReducerClass(Reduce.class ); 
  
        conf.setInputFormat(TextInputFormat.class );
        conf.setOutputFormat(TextOutputFormat.class );
  
        FileInputFormat.setInputPaths(conf, new  Path(args[ 0 ]));  
        FileOutputFormat.setOutputPath(conf, new  Path(args[ 1 ]));  
  
        JobClient.runJob(conf); 
    }  
}

编译命令：

mkdir Myjava

javac -classpath hadoop-core-1.1.2.jar -d Myjava WordCount.java

jar -cvf WordCount.jar -C Myjava .

运行命令：

bin/hadoop jar WordCount.jar com.my.WordCount /src/test.txt /output

这一次的程序是基于 Hadoop 1.1.2 的。

posted @ 2013-07-25 22:20 双子靓星阅读(389) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

双子靓星

一个可以跑的Hadoop的WordCount程序

公告