Hadoop WordCount（Streaming，Python，Java三合一）

一、Streaming

Map任务：

#!/bin/bash
# Hadoop Streaming mapper: reads text lines on stdin and writes one
# "word<TAB>count" line per distinct word seen by this mapper. Counts are
# accumulated in memory and emitted once at end of input, so the reducer
# receives partial sums rather than one record per occurrence.
awk 'BEGIN{
        # A run of spaces, commas, periods or tabs is ONE separator. With a
        # single-character class, text such as "a, b" yields an empty field
        # between the comma and the space, which was counted as a word.
        # NOTE(review): the original class contained a run of blanks that
        # looks like a tab flattened by the page; confirm tab was intended.
        FS = "[ ,.\t]+"
        OFS = "\t"
}{
        for( i = 1; i <= NF; ++i)
        {
                # Leading or trailing separators still produce an empty
                # first or last field; never count those.
                if ($i != "")
                {
                        dict[$i] += 1
                }
        }
}END{
        for( key in dict)
        {
                print key,dict[key]
        }
}'

Reducer任务：

#!/bin/bash
# Hadoop Streaming reducer: reads "word<TAB>count" lines on stdin, adds the
# counts up per word, and prints "word<TAB>total" for every word once all
# input has been consumed.
awk -v OFS='\t' '
# Fold the count (second field) into the running total of the word (first field).
{ totals[$1] += $2 }
# Input is exhausted: emit one line per accumulated word.
END {
        for (word in totals)
                print word, totals[word]
}'

启动脚本：

#!/bin/bash
# Launches the awk-based word count as a Hadoop Streaming job.

# Clear the output of a previous run, because the job will not start when the
# output directory already exists. -f keeps a missing directory (first run)
# from being reported as an error.
hadoop fs -rm -r -f /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output

# Generic options (-D, -files) must precede the streaming options. They
# replace the deprecated -jobconf / -file flags. The map count is only a hint
# to the framework; the old "mapred.job.tasks" key is not a Hadoop property
# and was silently ignored.
hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
        -D mapreduce.job.name=wordcount \
        -D mapreduce.job.maps=5 \
        -D mapreduce.job.reduces=3 \
        -files mapper.sh,reducer.sh \
        -input /data/apps/zhangwenchao/mapreduce/streaming/wordcount/input \
        -output /data/apps/zhangwenchao/mapreduce/streaming/wordcount/output \
        -mapper "sh -x mapper.sh" \
        -reducer "sh -x reducer.sh"

二、Python

Map任务：

#! /usr/bin/python
"""Hadoop Streaming mapper for word count.

Reads text lines from stdin and writes one ``word<TAB>1`` line to stdout for
every word occurrence. A line is first cut at the punctuation characters
``; , . ?`` and each piece is then split on whitespace.
"""
import sys
import re

# Punctuation that ends a chunk of text; compiled once rather than per line.
_PUNCTUATION = re.compile('[;,.?]')


def main():
    """Emit ``word<TAB>1`` for every word read from ``sys.stdin``."""
    for line in sys.stdin:
        for chunk in _PUNCTUATION.split(line):
            # split() without arguments drops leading/trailing whitespace and
            # never yields empty strings, so no separate strip() is needed.
            for item in chunk.split():
                # Single-argument print(...) behaves the same on Python 2 and 3.
                print("%s\t%s" % (item, 1))


if __name__ == '__main__':
    main()

Reducer任务：

#!/usr/bin/env python
"""Hadoop Streaming reducer for word count.

Reads ``word<TAB>count`` lines from stdin and writes ``word<TAB>total`` to
stdout each time the word changes, plus once more for the final word. Lines
for the same word must therefore arrive next to each other; the code relies
on that grouping rather than keeping every word in memory.
"""
from operator import itemgetter
import sys


def main():
    """Sum the counts of consecutive equal words read from ``sys.stdin``."""
    current_word = None
    current_count = 0
    for line in sys.stdin:
        line = line.strip()
        # A line without a tab (e.g. a blank line) cannot be unpacked into
        # word and count; skip it instead of crashing the task.
        parts = line.split('\t', 1)
        if len(parts) != 2:
            continue
        word, count = parts
        try:
            count = int(count)
        except ValueError:
            # Non-numeric count: ignore the record.
            continue
        if current_word == word:
            current_count += count
        else:
            if current_word is not None:
                print('%s\t%s' % (current_word, current_count))
            current_count = count
            current_word = word
    # Flush the last group. Testing current_word directly (instead of comparing
    # it with the last word read) prints nothing for empty input and still
    # emits the group when the final input line was skipped as malformed.
    if current_word is not None:
        print("%s\t%s" % (current_word, current_count))


if __name__ == '__main__':
    main()

启动脚本：

#!/bin/bash

# Clear the output of a previous run, because the job will not start when the
# output directory already exists. -f keeps a missing directory (first run)
# from being reported as an error.
hadoop fs -rm -r -f /data/apps/zhangwenchao/mapreduce/python/wordcount/output

# Generic options (-D, -files) must precede the streaming options. They
# replace the deprecated -jobconf / -file flags. The map count is only a hint
# to the framework; the old "mapred.job.tasks" key is not a Hadoop property
# and was silently ignored.
# mapper.py and reducer.py are invoked directly, so both need their shebang
# line and the executable bit set.
hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar \
        -D mapreduce.job.name=wordcount \
        -D mapreduce.job.maps=5 \
        -D mapreduce.job.reduces=3 \
        -files mapper.py,reducer.py \
        -input /data/apps/zhangwenchao/mapreduce/python/wordcount/input \
        -output /data/apps/zhangwenchao/mapreduce/python/wordcount/output \
        -mapper "mapper.py" \
        -reducer "reducer.py"

三、Java

Map：

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();

public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());

context.write(word, one);
}
}
}

Reduce：

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Word-count reducer: writes each key once, paired with the sum of all the
 * integer values received for it.
 */
public class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Adds up every value supplied for {@code key} and emits {@code (key, total)}.
     *
     * @param key     the word being totalled
     * @param values  the counts received for that word
     * @param context sink for the emitted {@code (key, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable count : values) {
            total = total + count.get();
        }
        IntWritable result = new IntWritable(total);
        context.write(key, result);
    }
}

Main：

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the Java word-count job: wires {@link MyMap} and {@link MyReduce}
 * into a MapReduce job, runs it, and exits with the job's outcome.
 */
public class Main {
    /**
     * Configures and submits the job, then blocks until it finishes.
     *
     * <p>The process exits with status 0 when the job succeeds and 1 when it
     * fails, so calling scripts can detect a failed run.
     *
     * @param args command-line arguments; not used
     * @throws Exception if the job cannot be configured, submitted or monitored
     */
    public static void main(String[] args) throws Exception {
        // NOTE(review): "intput" looks like a typo for "input" (the streaming
        // and Python jobs read from .../input) - confirm the real directory
        // name before changing this path.
        String input = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/intput";
        String output = "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/output";
        Configuration conf = new Configuration();

        // Job.getInstance replaces the deprecated Job(Configuration) constructor.
        Job job = Job.getInstance(conf);
        job.setJobName("test4");
        job.setJarByClass(Main.class);

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setMapperClass(MyMap.class);
        job.setReducerClass(MyReduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // The reducer only sums its values, so it can also run as the combiner.
        job.setCombinerClass(MyReduce.class);

        job.setNumReduceTasks(3);

        // Propagate the job result; previously a failed job still exited 0.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

posted @ 2018-07-05 18:06 白石江边阅读(285) 评论(0) 编辑收藏举报

刷新页面返回顶部

白石江边

Hadoop WordCount（Streaming，Python，Java三合一）

公告