单词计数示例
一、代码
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class WordCount extends Configured implements Tool{
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
int exitcode=ToolRunner.run(new WordCount(), args);
System.exit(exitcode);
}
// TODO Auto-generated method stub
private final static IntWritable one = new IntWritable(1);
private static Text word = new Text();
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
word.set(tokenizer.nextToken());
output.collect(word, one);
}
}
}
public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
@Override
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
@Override
public int run(String[] args) throws Exception {
// TODO Auto-generated method stub
if(args.length!=2) {
System.err.printf("Usage %s need <input> <output>\n", getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
System.out.print(args[0]);
JobConf job=new JobConf(getConf());
job.setJarByClass(getClass());
job.setJobName("wordcount");
job.setInputFormat(TextInputFormat.class); //为map-reduce任务设置InputFormat实现类
job.setOutputFormat(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setOutputKeyClass(Text.class); //为job的输出数据设置Key类
job.setOutputValueClass(IntWritable.class);
job.setMapperClass(WordCount.Map.class);
job.setCombinerClass(WordCount.Reduce.class);
job.setReducerClass(WordCount.Reduce.class);
JobClient.runJob(job);
return 0;
}
}
二、执行
1、本地执行
export HADOOP_CONF_DIR=/root/soft/hdp312/localconf/hadoop
hadoop jar wordcount.jar WordCount testinput testoutput
三、流模式运行
--mapper
--reducer
以上二者都可以指定为脚本命令或 Java 类
脚本从标准输入（stdin）读取数据，并将结果写到标准输出（stdout）
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.1.2.jar -input testinput/ -output testoutput/ -mapper /bin/cat