Hadoop 示例程序 Grep 分析

该程序从指定的输入文档中提取与给定正则表达式相匹配的字符串,统计每个匹配串出现的次数,并按出现次数从高到低排序后输出。

  1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18 package org.apache.hadoop.examples;
19
20 import java.util.Random;
21
22 import org.apache.hadoop.conf.Configuration;
23 import org.apache.hadoop.conf.Configured;
24 import org.apache.hadoop.fs.FileSystem;
25 import org.apache.hadoop.fs.Path;
26 import org.apache.hadoop.io.LongWritable;
27 import org.apache.hadoop.io.Text;
28 import org.apache.hadoop.mapred.*;
29 import org.apache.hadoop.mapred.lib.*;
30 import org.apache.hadoop.util.Tool;
31 import org.apache.hadoop.util.ToolRunner;
32
33 /* Extracts matching regexs from input files and counts them. */
34 /* 从input中提取与表达式相符的单词并计算词频*/
35 publicclass Grep extends Configured implements Tool {
36 private Grep() {} // singleton
37 /** 继承自配置基类Configured,并扩展接口Tool
38 *Configured类中有一个变量conf用于存储配置文件
39 *Tool中只有一个方法需要实现
40 *int run(String [] args)用于运行输入参数
41 */
42 publicint run(String[] args) throws Exception {
43 if (args.length <3) {
44 System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
45 ToolRunner.printGenericCommandUsage(System.out);
46 return-1;
47 }
48
49 Path tempDir =
50 new Path("grep-temp-"+
51 Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
52
53 JobConf grepJob =new JobConf(getConf(), Grep.class);//创建job
54
55 try {
56
57 grepJob.setJobName("grep-search"); //job命名
58
59 FileInputFormat.setInputPaths(grepJob, args[0]); //设置job的输入路径
60
61 grepJob.setMapperClass(RegexMapper.class); //设置Mapper类
62 grepJob.set("mapred.mapper.regex", args[2]);
63 if (args.length ==4)
64 grepJob.set("mapred.mapper.regex.group", args[3]);
65
66 grepJob.setCombinerClass(LongSumReducer.class); //设置Combiner类
67 grepJob.setReducerClass(LongSumReducer.class); //设置Reducer类
68
69 FileOutputFormat.setOutputPath(grepJob, tempDir); //设置输出路径
70 grepJob.setOutputFormat(SequenceFileOutputFormat.class); //设置输出格式
71 grepJob.setOutputKeyClass(Text.class); //设置输出键的类
72 grepJob.setOutputValueClass(LongWritable.class); //设置输出值的类
73
74 JobClient.runJob(grepJob); //运行
75
76 JobConf sortJob =new JobConf(Grep.class); //排序,内容同上
77 sortJob.setJobName("grep-sort");
78
79 FileInputFormat.setInputPaths(sortJob, tempDir);
80 sortJob.setInputFormat(SequenceFileInputFormat.class);
81
82 sortJob.setMapperClass(InverseMapper.class);
83
84 sortJob.setNumReduceTasks(1); // write a single file
85 FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
86 sortJob.setOutputKeyComparatorClass // sort by decreasing freq
87 (LongWritable.DecreasingComparator.class);
88
89 JobClient.runJob(sortJob);
90 }
91 finally {
92 FileSystem.get(grepJob).delete(tempDir, true);
93 }
94 return0;
95 }
96
97 publicstaticvoid main(String[] args) throws Exception {
98 int res = ToolRunner.run(new Configuration(), new Grep(), args);
99 System.exit(res);
100 }
101
102 }

  

posted on 2011-07-20 10:15  york_hust  阅读(4630)  评论(0)  编辑  收藏  举报