9.19 MapReduce实例

例子：WordCount v2.0

这里是一个更全面的WordCount例子，它使用了我们已经讨论过的很多Map/Reduce框架提供的功能。

运行这个例子需要HDFS的某些功能，特别是 DistributedCache相关功能。因此这个例子只能运行在伪分布式或者完全分布式模式的 Hadoop上。

源代码

	WordCount.java
1.	package org.myorg;
2.
3.	import java.io.*;
4.	import java.util.*;
5.
6.	import org.apache.hadoop.fs.Path;
7.	import org.apache.hadoop.filecache.DistributedCache;
8.	import org.apache.hadoop.conf.*;
9.	import org.apache.hadoop.io.*;
10.	import org.apache.hadoop.mapred.*;
11.	import org.apache.hadoop.util.*;
12.
13.	public class WordCount extends Configured implements Tool {
14.
15.	public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
16.
17.	static enum Counters { INPUT_WORDS }
18.
19.	private final static IntWritable one = new IntWritable(1);
20.	private Text word = new Text();
21.
22.	private boolean caseSensitive = true;
23.	private Set<String> patternsToSkip = new HashSet<String>();
24.
25.	private long numRecords = 0;
26.	private String inputFile;
27.
28.	public void configure(JobConf job) {
29.	caseSensitive = job.getBoolean("wordcount.case.sensitive", true);
30.	inputFile = job.get("map.input.file");
31.
32.	if (job.getBoolean("wordcount.skip.patterns", false)) {
33.	Path[] patternsFiles = new Path[0];
34.	try {
35.	patternsFiles = DistributedCache.getLocalCacheFiles(job);
36.	} catch (IOException ioe) {
37.	System.err.println("Caught exception while getting cached files: " + StringUtils.stringifyException(ioe));
38.	}
39.	for (Path patternsFile : patternsFiles) {
40.	parseSkipFile(patternsFile);
41.	}
42.	}
43.	}
44.
45.	private void parseSkipFile(Path patternsFile) {
46.	try {
47.	BufferedReader fis = new BufferedReader(new FileReader(patternsFile.toString()));
48.	String pattern = null;
49.	while ((pattern = fis.readLine()) != null) {
50.	patternsToSkip.add(pattern);
51.	}
52.	} catch (IOException ioe) {
53.	System.err.println("Caught exception while parsing the cached file '" + patternsFile + "' : " + StringUtils.stringifyException(ioe));
54.	}
55.	}
56.
57.	public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
58.	String line = (caseSensitive) ? value.toString() : value.toString().toLowerCase();
59.
60.	for (String pattern : patternsToSkip) {
61.	line = line.replaceAll(pattern, "");
62.	}
63.
64.	StringTokenizer tokenizer = new StringTokenizer(line);
65.	while (tokenizer.hasMoreTokens()) {
66.	word.set(tokenizer.nextToken());
67.	output.collect(word, one);
68.	reporter.incrCounter(Counters.INPUT_WORDS, 1);
69.	}
70.
71.	if ((++numRecords % 100) == 0) {
72.	reporter.setStatus("Finished processing " + numRecords + " records " + "from the input file: " + inputFile);
73.	}
74.	}
75.	}
76.
77.	public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
78.	public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
79.	int sum = 0;
80.	while (values.hasNext()) {
81.	sum += values.next().get();
82.	}
83.	output.collect(key, new IntWritable(sum));
84.	}
85.	}
86.
87.	public int run(String[] args) throws Exception {
88.	JobConf conf = new JobConf(getConf(), WordCount.class);
89.	conf.setJobName("wordcount");
90.
91.	conf.setOutputKeyClass(Text.class);
92.	conf.setOutputValueClass(IntWritable.class);
93.
94.	conf.setMapperClass(Map.class);
95.	conf.setCombinerClass(Reduce.class);
96.	conf.setReducerClass(Reduce.class);
97.
98.	conf.setInputFormat(TextInputFormat.class);
99.	conf.setOutputFormat(TextOutputFormat.class);
100.
101.	List<String> other_args = new ArrayList<String>();
102.	for (int i=0; i < args.length; ++i) {
103.	if ("-skip".equals(args[i])) {
104.	DistributedCache.addCacheFile(new Path(args[++i]).toUri(), conf);
105.	conf.setBoolean("wordcount.skip.patterns", true);
106.	} else {
107.	other_args.add(args[i]);
108.	}
109.	}
110.
111.	FileInputFormat.setInputPaths(conf, new Path(other_args.get(0)));
112.	FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));
113.
114.	JobClient.runJob(conf);
115.	return 0;
116.	}
117.
118.	public static void main(String[] args) throws Exception {
119.	int res = ToolRunner.run(new Configuration(), new WordCount(), args);
120.	System.exit(res);
121.	}
122.	}
123.

运行样例

输入样例：

$ bin/hadoop dfs -ls /usr/joe/wordcount/input/
/usr/joe/wordcount/input/file01
/usr/joe/wordcount/input/file02

$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file01
Hello World, Bye World!

$ bin/hadoop dfs -cat /usr/joe/wordcount/input/file02
Hello Hadoop, Goodbye to hadoop.

运行程序：

$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount /usr/joe/wordcount/input /usr/joe/wordcount/output

输出：

$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000
Bye 1
Goodbye 1
Hadoop, 1
Hello 2
World! 1
World, 1
hadoop. 1
to 1

注意此时的输入与第一个版本的不同，输出的结果也有不同。

现在通过DistributedCache插入一个模式文件，文件中保存了要被忽略的单词模式。

$ hadoop dfs -cat /user/joe/wordcount/patterns.txt
\.
\,
\!
to

再运行一次，这次使用更多的选项：

$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount -Dwordcount.case.sensitive=true /usr/joe/wordcount/input /usr/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt

应该得到这样的输出：

$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000
Bye 1
Goodbye 1
Hadoop 1
Hello 2
World 2
hadoop 1

再运行一次，这一次关闭大小写敏感性（case-sensitivity）：

$ bin/hadoop jar /usr/joe/wordcount.jar org.myorg.WordCount -Dwordcount.case.sensitive=false /usr/joe/wordcount/input /usr/joe/wordcount/output -skip /user/joe/wordcount/patterns.txt

输出：

$ bin/hadoop dfs -cat /usr/joe/wordcount/output/part-00000
bye 1
goodbye 1
hadoop 2
hello 2
world 2

程序要点

通过使用一些Map/Reduce框架提供的功能，WordCount的第二个版本在原始版本基础上有了如下的改进：

展示了应用程序如何在Mapper (和Reducer)的configure方法中读取配置参数(28-43行)。
展示了作业如何使用DistributedCache 来分发只读数据。这里允许用户指定单词的模式，在计数时忽略那些符合模式的单词(104行)。
展示了如何借助Tool接口和GenericOptionsParser处理Hadoop的通用命令行选项 (87-116, 119行)。
展示了应用程序如何使用Counters(68行)，如何通过传递给map（和reduce）方法的Reporter实例来设置应用程序的状态信息(72行)。

posted @ 2021-09-19 08:24 While！true 阅读(48) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

While！true

9.19 MapReduce实例

例子：WordCount v2.0

源代码

运行样例

程序要点

公告