Hadoop WordCount (Streaming, Python, and Java three-in-one)
1. Streaming
Map task:
#!/bin/bash
# Mapper: split each input line on spaces, commas, and periods, then
# pre-aggregate counts within this task and emit "word<TAB>count" pairs.
awk 'BEGIN {
    FS = "[ ,.]"
    OFS = "\t"
}
{
    for (i = 1; i <= NF; ++i)
        if ($i != "")   # skip empty fields produced by consecutive delimiters
            dict[$i] += 1
}
END {
    for (key in dict)
        print key, dict[key]
}'
Reduce task:
#!/bin/bash
# Reducer: sum the per-mapper counts for each word.
awk 'BEGIN { OFS = "\t" }
{ dict[$1] += $2 }
END {
    for (key in dict)
        print key, dict[key]
}'
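Before submitting, the two scripts can be smoke-tested locally; the sort in the middle stands in for the shuffle phase. This is just a sketch, and sample.txt is a made-up input file:

# Local dry run of the streaming pipeline; sample.txt is hypothetical.
echo "hello world, hello hadoop." > sample.txt
sh mapper.sh < sample.txt | sort | sh reducer.sh
# Expected counts, tab-separated (line order may vary):
# hadoop  1
# hello   2
# world   1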
Launch script:
#!/bin/bash
# Clear any previous output directory, then submit the streaming job.
hadoop fs -rm -r /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output
hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
    -input /data/apps/zhangwenchao/mapreduce/streaming/wordcount/input \
    -output /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output \
    -mapper "sh -x mapper.sh" \
    -reducer "sh -x reducer.sh" \
    -file mapper.sh \
    -file reducer.sh \
    -jobconf mapred.job.name=wordcount \
    -jobconf mapred.map.tasks=5 \
    -jobconf mapred.reduce.tasks=3

(Note that -jobconf and -file are deprecated in Hadoop 2.x; newer setups would use the generic -D and -files options instead.)
2. Python
Map task:
#!/usr/bin/python
# Mapper: split each line on punctuation, then on whitespace, and emit
# "word<TAB>1" for every token.
import sys
import re

for line in sys.stdin:
    wordlist = re.split('[;,.?]', line)
    for words in wordlist:
        for item in words.strip().split():
            print "%s\t%s" % (item, 1)
Reduce task:
#!/usr/bin/env python
# Reducer: streaming delivers input sorted by key, so counts for the
# same word arrive on adjacent lines; sum them and emit on key change.
import sys

current_word = None
current_count = 0
word = None

for line in sys.stdin:
    word, count = line.strip().split('\t', 1)
    try:
        count = int(count)
    except ValueError:
        continue  # ignore lines with a malformed count
    if current_word == word:
        current_count += count
    else:
        if current_word:
            print "%s\t%s" % (current_word, current_count)
        current_count = count
        current_word = word

# Flush the final word, if any input was seen.
if current_word is not None:
    print "%s\t%s" % (current_word, current_count)
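As with the shell version, the pair can be verified locally before submitting. Here sort -k1,1 mimics the shuffle, which this reducer depends on because it only compares adjacent keys; sample.txt is again a made-up input file:

# Local dry run of the Python pipeline; sample.txt is hypothetical.
echo "hello world, hello hadoop." > sample.txt
python mapper.py < sample.txt | sort -k1,1 | python reducer.py
# Expected output, tab-separated:
# hadoop  1
# hello   2
# world   1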
Launch script:
#!/bin/bash
# Clear any previous output directory, then submit the streaming job.
hadoop fs -rm -r /data/apps/zhangwenchao/mapreduce/python/wordcount/output
hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
    -input /data/apps/zhangwenchao/mapreduce/python/wordcount/input \
    -output /data/apps/zhangwenchao/mapreduce/python/wordcount/output \
    -mapper "python mapper.py" \
    -reducer "python reducer.py" \
    -file mapper.py \
    -file reducer.py \
    -jobconf mapred.job.name=wordcount \
    -jobconf mapred.map.tasks=5 \
    -jobconf mapred.reduce.tasks=3

Invoking the scripts through python avoids depending on their executable bit being set on the task nodes.
3. Java
Map:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Tokenize on whitespace and emit (word, 1) for every token.
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
Reduce:
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum all partial counts for this word.
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
Main:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {

    public static void main(String[] args) throws Exception {
        String input = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/input";
        String output = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/output";

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);  // new Job(conf) is deprecated
        job.setJobName("test4");
        job.setJarByClass(Main.class);

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setMapperClass(MyMap.class);
        job.setReducerClass(MyReduce.class);
        // The reducer doubles as a combiner, since summing is associative.
        job.setCombinerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(3);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
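A minimal build-and-submit sketch, assuming the three classes live in the default package in the current directory and the hadoop command is on the PATH; the jar name wordcount.jar is made up:

# Hypothetical build-and-submit steps.
mkdir -p classes
javac -classpath "$(hadoop classpath)" -d classes MyMap.java MyReduce.java Main.java
jar cf wordcount.jar -C classes .
hadoop jar wordcount.jar Main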