4.Eclipse下的Hadoop开发实践

前三篇文章介绍了Hadoop的环境配置、在命令行下运行程序以及查看运行结果等内容，本篇介绍如何在Eclipse下进行Hadoop开发。

安装eclipse

注：只需在master上安装即可

在www.eclipse.org下载linux对应32位安装包，在根目录新建eclipse文件夹

拷贝eclipse-java-mars-R-linux-gtk.tar.gz到eclipse目录下，并解压。

运行eclipse

这样eclipse就安装好了。

接下来安装eclipse下的hadoop-2.2.0插件

下载插件，解压后放到eclipse的plugins目录下，然后重启eclipse

http://yun.baidu.com/share/link?shareid=4030904860&uk=3678057217

配置hadoop installation directory

打开Window-->Preferences，找到Hadoop Map/Reduce选项，在这个选项里你需要配置Hadoop installation directory。配置完成后退出。

选择目录/usr/local/hadoop/hadoop-2.2.0，点击Apply-->OK

配置Map/Reduce Locations。打开Window-->Show View-->Other...，在MapReduce Tools中选择Map/Reduce Locations。

在Map/Reduce Locations（Eclipse界面的正下方）中新建一个Hadoop Location

在这个View中，点击鼠标右键-->New Hadoop Location。

在弹出的对话框中你需要配置Location name，可任意填，如Hadoop，以及Map/Reduce Master和DFS Master。这里面的Host、Port分别为你在mapred-site.xml、core-site.xml中配置的地址及端口。

新建一个hadoop项目测试.

新建项目:File-->New-->Other-->Map/Reduce Project 项目名可以随便取，如hadoopTest。

新建测试类，wordCountTest,（复制wordCount源代码）代码如下:

  1 package hadoopTest;
  2 
  3 import java.io.IOException;
  4 
  5 import java.util.StringTokenizer; 
  6 
  7 import org.apache.hadoop.conf.Configuration; 
  8 
  9 import org.apache.hadoop.fs.Path; 
 10 
 11 import org.apache.hadoop.io.IntWritable; 
 12 
 13 import org.apache.hadoop.io.Text; 
 14 
 15 import org.apache.hadoop.mapred.JobConf; 
 16 
 17 import org.apache.hadoop.mapreduce.Job; 
 18 
 19 import org.apache.hadoop.mapreduce.Mapper;
 20 
 21 import org.apache.hadoop.mapreduce.Reducer; 
 22 
 23 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 
 24 
 25 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 26 
 27 import org.apache.hadoop.util.GenericOptionsParser;
 28 
 29 public class wordCountTest {
 30 
 31 /** 
 32 
 33 * MapReduceBase类:实现了Mapper和Reducer接口的基类（其中的方法只是实现接口，而未作任何事情） 
 34 
 35 * Mapper接口：
 36 
 37 * WritableComparable接口：实现WritableComparable的类可以相互比较。所有被用作key的类应该实现此接口。  
 38     * Reporter 则可用于报告整个应用的运行进度，本例中未使用。   
 39     *   
 40     */    
 41 public static class TokenizerMapper   
 42       extends Mapper<Object, Text, Text, IntWritable>{  
 43      /**  
 44       * LongWritable, IntWritable, Text 均是 Hadoop 中实现的用于封装 Java 数据类型的类，这些类实现了WritableComparable接口，  
 45       * 都能够被串行化从而便于在分布式环境中进行数据交换，你可以将它们分别视为long,int,String 的替代品。  
 46       */   
 47    private final static IntWritable one = new IntWritable(1);  
 48    private Text word = new Text();//Text 实现了BinaryComparable类可以作为key值  
 49    /**  
 50     * Mapper接口中的map方法：  
 51     * void map(K1 key, V1 value, OutputCollector<K2,V2> output, Reporter reporter)  
 52     * 映射一个单个的输入k/v对到一个中间的k/v对  
 53     * 输出对不需要和输入对是相同的类型，输入对可以映射到0个或多个输出对。  
 54     * OutputCollector接口：收集Mapper和Reducer输出的<k,v>对。  
 55     * OutputCollector接口的collect(k, v)方法:增加一个(k,v)对到output  
 56     */    
 57      public void map(Object key, Text value, Context context) throws IOException, InterruptedException {  
 58        /** 
 59         * 原始数据： 
 60         * c++ java hello 
 61            world java hello 
 62            you me too 
 63            map阶段，数据如下形式作为map的输入值：key为偏移量 
 64            0  c++ java hello 
 65            16 world java hello 
 66            34 you me too 
 67         */  
 68         /** 
 69          * 以下解析键值对 
 70         * 解析后以键值对格式形成输出数据 
 71         * 格式如下：前者是键排好序的，后者数字是值 
 72         * c++ 1 
 73         * java 1 
 74         * hello 1 
 75         * world 1 
 76         * java 1 
 77         * hello 1 
 78         * you 1 
 79         * me 1 
 80         * too 1 
 81         * 这些数据作为reduce的输出数据 
 82         */  
 83      StringTokenizer itr = new StringTokenizer(value.toString());//得到什么值  
 84      System.out.println("value什么东西 ： "+value.toString());  
 85      System.out.println("key什么东西 ： "+key.toString());  
 86      while (itr.hasMoreTokens()) {  
 87        word.set(itr.nextToken());  
 88        context.write(word, one);  
 89      }  
 90 }  
 91    }
 92 public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {  
 93     private IntWritable result = new IntWritable();  
 94     /** 
 95      * reduce过程是对输入数据解析形成如下格式数据： 
 96      * (c++ [1]) 
 97      * (java [1,1]) 
 98      * (hello [1,1]) 
 99      * (world [1]) 
100      * (you [1]) 
101      * (me [1]) 
102      * (you [1]) 
103      * 供接下来的实现的reduce程序分析数据数据 
104      *  
105      */  
106     public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {  
107       int sum = 0;  
108       /** 
109        * 自己的实现的reduce方法分析输入数据 
110        * 形成数据格式如下并存储 
111        *     c++    1 
112        *    hello   2 
113        *    java    2 
114        *    me      1 
115        *    too     1 
116        *    world   1 
117        *    you     1 
118        *     
119        */  
120       for (IntWritable val : values) {  
121         sum += val.get();  
122       }  
123       result.set(sum);  
124       context.write(key, result);  
125     }  
126   }  
127 public static void main(String[] args) throws Exception {  
128      /**  
129       * JobConf：map/reduce的job配置类，向hadoop框架描述map-reduce执行的工作  
130       * 构造方法：JobConf()、JobConf(Class exampleClass)、JobConf(Configuration conf)等  
131       */    
132    //重点！根据自己的实际情况填写输入分析的目录和结果输出的目录  
133    args = new String[2];
134    args[0] = "hdfs://localhost(ip地址):9000/input";
135    args[1] = "hdfs://localhost(ip地址):9000/output";
136 Configuration conf = new Configuration();  
137    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();  
138    for(String s : otherArgs){
139    System.out.println(s);
140    }
141    //这里需要配置参数即输入和输出的HDFS的文件路径  
142    if (otherArgs.length != 2) {  
143      System.err.println("Usage: wordcount <in> <out>");  
144      System.exit(2);  
145    }  
146   // JobConf conf1 = new JobConf(WordCount.class);  
147    Job job = new Job(conf, "word count");//Job(Configuration conf, String jobName) 设置job名称和  
148    job.setJarByClass(wordCountTest.class);  
149    job.setMapperClass(TokenizerMapper.class); //为job设置Mapper类   
150    job.setCombinerClass(IntSumReducer.class); //为job设置Combiner类    
151    job.setReducerClass(IntSumReducer.class); //为job设置Reduce类     
152    job.setOutputKeyClass(Text.class);        //设置输出key的类型  
153    job.setOutputValueClass(IntWritable.class);//  设置输出value的类型  
154    FileInputFormat.addInputPath(job, new Path(otherArgs[0])); //为map-reduce任务设置InputFormat实现类   设置输入路径  
155    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));//为map-reduce任务设置OutputFormat实现类  设置输出路径  
156    System.exit(job.waitForCompletion(true) ? 0 : 1);  
157 }  
158 }
159 
右键选择Run As-->Java Application，如果执行成功，刷新一下HDFS的目录就会出现/output目录，结果就在part-r-00000文件中。

posted on 2015-07-13 15:59 Satchmo丶阅读(394) 评论(0) 收藏举报