Mapreduce词频统计

maven建立quick-start工程。

pom.xml 

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>cn.edu.bupt.wcy</groupId>
  <artifactId>wordcount</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>wordcount</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Pin the compiler level so builds are reproducible across machines;
         Hadoop 2.7.x runs on Java 7+. -->
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
    <!-- Single place to bump the Hadoop version for all three artifacts. -->
    <hadoop.version>2.7.1</hadoop.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <!-- 4.13.2 instead of the EOL 3.8.1; JUnit 4 still ships the
           junit.framework package, so JUnit-3-style tests keep compiling. -->
      <version>4.13.2</version>
      <scope>test</scope>
    </dependency>
    <!-- hadoop-client transitively pulls hadoop-common/hadoop-hdfs, but the
         explicit declarations are kept so the versions stay visible. -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
  </dependencies>
</project>

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

3. Java代码:Mapper、Reducer 和 Runner(主类)三个文件:

mapper

package cn.edu.bupt.wcy.wordcount;

 

import java.io.IOException;

 

import org.apache.commons.lang.StringUtils;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

 

 

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable>{

 

@Override

protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)

throws IOException, InterruptedException {

// TODO Auto-generated method stub

//super.map(key, value, context);

//String[] words = StringUtils.split(value.toString());

  String[] words = StringUtils.split(value.toString(), " ");

for(String word:words)

{

  context.write(new Text(word), new LongWritable(1));

 

}

}

}

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

reducer:

package cn.edu.bupt.wcy.wordcount;

 

import java.io.IOException;

 

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Reducer;

 

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

 

@Override

protected void reduce(Text arg0, Iterable<LongWritable> arg1,

Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {

// TODO Auto-generated method stub

//super.reduce(arg0, arg1, arg2);

int sum=0;

for(LongWritable num:arg1)

{

sum += num.get();

 

}

context.write(arg0,new LongWritable(sum));

 

 

}

}

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

runner:

package cn.edu.bupt.wcy.wordcount;

 

import java.io.IOException;

 

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

 

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

 

public class WordCountRunner {

 

public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {

Configuration conf = new Configuration();  

    Job job = new Job(conf);  

    job.setJarByClass(WordCountRunner.class);  

    job.setJobName("wordcount");  

    job.setOutputKeyClass(Text.class);  

    job.setOutputValueClass(LongWritable.class);  

    job.setMapperClass(WordCountMapper.class);  

    job.setReducerClass(WordCountReducer.class);  

    job.setInputFormatClass(TextInputFormat.class);  

    job.setOutputFormatClass(TextOutputFormat.class);  

    FileInputFormat.addInputPath(job, new Path(args[1]));  

    FileOutputFormat.setOutputPath(job, new Path(args[2]));  

    job.waitForCompletion(true);  

}

 

}

 

 

 

 

 

 

 

 

 

 

 

 

 

打包成jar包后,放到集群上运行。先在集群上新建一个文件夹:

hdfs dfs -mkdir /input_wordcount,再用 hdfs dfs -put 本地文件 /input_wordcount 放入单词文件,比如文件内容为:

hello world 

I like playing basketball

hello java。。。

运行 hadoop jar WordCount.jar(jar包) cn.edu.bupt.wcy.wordcount.WordCountRunner(主类需写全限定名) /input_wordcount /output_wordcount(注意:输出目录必须事先不存在)

运行完成后,查看:

hdfs dfs -ls /output_wordcount。已经生成了结果,再用 hdfs dfs -cat /output_wordcount/part-r-00000 查看内容即可。

posted on 2021-10-23 17:04  风中明月  阅读(99)  评论(0编辑  收藏  举报