Word frequency counting implemented four ways: Java API, MapReduce, awk, and Scala

Word count with awk:

Method 1:
vi wordcount.awk
{
	for (i = 1; i <= NF; i++)   # NF is the number of fields (words) in the current line
		freq[$i]++
}
END {
	for (word in freq)
		printf "%s %d\n", word, freq[word]
}
Run: awk -f wordcount.awk words.txt
----------------------------------
Method 2 (as a shell script):
vi wordcount_awk.sh
#!/bin/sh
awk -F " " '{
  for (i = 1; i<=NF; i++)  
    freq[$i]++
}
END{
  for (word in freq)
    printf "%s %d\n", word, freq[word]
}' $1

chmod u+x wordcount_awk.sh
./wordcount_awk.sh words.txt
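
For example, if words.txt contained the single line "hello world hello", either script would print the two lines below (the for (word in freq) loop does not guarantee any particular order):
hello 2
world 1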
-----------------------------

NF is the awk built-in variable holding the number of fields in the current record.
$NF is the last field (column), i.e. printing $NF outputs the content of the last field.

[root@localhost SHELL]# free -m | grep buffers\/
-/+ buffers/cache:       1815       1859
[root@localhost SHELL]# free -m | grep buffers\/ | awk '{print $NF}'
1859
[root@localhost SHELL]# free -m | grep buffers\/ | awk '{print NF}'
4
[root@localhost SHELL]# 
--------------------------------
Common printf format conversions:
%x  hexadecimal
%o  octal
%d or %i  decimal integer
%c  single character
%s  string
%f or %e  real number (decimal or scientific notation)
%ld  long integer
%%  a literal percent sign
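
The same C-style conversions are also available in Java's System.out.printf (Java accepts %d but not %i, and formats long values with %d rather than %ld). A minimal sketch for illustration:

public class PrintfDemo {
    public static void main(String[] args) {
        int n = 255;
        // %d decimal, %x hexadecimal, %o octal
        System.out.printf("%d %x %o%n", n, n, n);   // prints: 255 ff 377
        // %s string, %c character, %f fixed-point, %e scientific, %% literal percent sign
        System.out.printf("%s %c %.2f %e 100%%%n", "word", 'A', 3.14, 31400.0);
    }
}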

Word count with the Java API:

package cn.WordTongJi;

import java.io.*;
import java.util.HashMap;
import java.util.Map;

/**
 * Created by Administrator on 2018/6/1 0001.
 */
public class WordDemo {
    public static void main(String[] args) throws IOException {
        // Open the input file with a buffered reader
        BufferedReader br = new BufferedReader(new FileReader("D:\\test\\aaa.txt"));
        // Holds the line currently being read
        String nextLines = "";

        // Map of word -> number of occurrences
        Map<String, Integer> map = new HashMap<String, Integer>();
        while ((nextLines = br.readLine()) != null) {
            // Split the line on spaces to get the words
            String[] data = nextLines.split(" ");
            // Walk over the words and count them in the map
            for (String word : data) {
                // Increment the word's count, starting at 1 the first time it is seen
                Integer count = map.get(word);
                map.put(word, count == null ? 1 : count + 1);
            }
        }
        br.close();
        // Iterate over the words in the map
        // keySet() returns a Set containing all keys of the Map
        for (String key : map.keySet()) {
            // Look up and print the count stored for each key
            System.out.println(key + "----" + map.get(key));
        }
    }
}
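
The HashMap above does not keep any particular order. If the counts should also be sorted by frequency, as the Scala version further down does, one possible sketch is the helper below (the class and method names are made up for illustration):

import java.util.*;

public class WordCountSort {
    // Prints the n most frequent words from a word -> count map, highest count first.
    static void printTop(Map<String, Integer> counts, int n) {
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(counts.entrySet());
        // Sort by count, largest first
        entries.sort((a, b) -> Integer.compare(b.getValue(), a.getValue()));
        for (int i = 0; i < Math.min(n, entries.size()); i++) {
            Map.Entry<String, Integer> e = entries.get(i);
            System.out.println(e.getKey() + "----" + e.getValue());
        }
    }
}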

Word count with MapReduce:

package cn.bcqm1711.mr.day01;

/**
 * Created by Administrator on 2018/5/2.
 */

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

import java.io.IOException;

/**
 * @author :YongKe.Pan
 * @Desc :  Custom word count
 * @create 2018-05-02 9:44
 **/

public class CustomWordCount {


    //MapTask phase: by default one data block corresponds to one split, and one split to one MapTask
    //LongWritable, Text are the data types of each line's byte offset and each line's content
    //Text, IntWritable are the data types of the key/value pairs output by the map
    public static class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable one = new IntWritable(1);
        private Text word = new Text();


        //Called once before the map logic starts
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {

        }

        //Business logic: this map method is called once for every input line
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //Get the content of the current line
            String line = value.toString();
            //Split the line into words
            String[] words = line.split(" ");
            for (String wd : words) {
                word.set(wd);
                //Emit <word, 1>; map output is written to local disk
                context.write(word, one);
            }
        }

        //cleanup is called once after all map calls have finished
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {

        }
    }

    //ReducerTask phase
    //The first Text, IntWritable pair is the key/value data type received from the MapTask output
    //The second Text, IntWritable pair is the key/value data type written to HDFS after the reduce logic
    public static class WCReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        //Called once before the reduce logic starts
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {

        }

        //Keys with the same hash code are sent to the same reducer
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            //Write the aggregated word and its count to HDFS
            context.write(key, new IntWritable(sum));
        }

        //Called once after all reduce calls have finished
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {

        }
    }

    //Driver: configures and submits the job
    public static void main(String[] args) throws Exception {
        //Get a configuration object
        Configuration conf = new Configuration();
        //"CustomWordCount" is the job name, which makes the job easy to find on the history server
        //Job job=new Job();
        Job job = Job.getInstance(conf, "CustomWordCount");
        //Set the class that contains the job's entry point
        job.setJarByClass(CustomWordCount.class);

        //Configure the MapTask phase
        job.setMapperClass(WCMapper.class);//map-phase business logic
        job.setMapOutputKeyClass(Text.class);//tell the MR framework the data type of the map output key
        job.setMapOutputValueClass(IntWritable.class);//tell the MR framework the data type of the map output value
        //Take the input path from the command line (e.g. /words3.txt passed when the job is submitted)
        FileInputFormat.addInputPath(job,new Path(args[0])); //file path the MR job should process

        //Configure the ReduceTask phase
        job.setReducerClass(WCReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        //HDFS directory where the results will be written (e.g. /out0502)
        FileOutputFormat.setOutputPath(job,new Path(args[1]));
        job.setPartitionerClass(HashPartitioner.class);

        job.setNumReduceTasks(2);
        //Submit the job and wait for it to complete
        boolean isOk = job.waitForCompletion(true);
        System.exit(isOk ? 0 : 1);
    }
}
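
To run the job, the class would typically be packaged into a jar and submitted to the cluster with the input file and output directory as the two arguments; a sketch of the command, with wordcount.jar as a placeholder jar name:

hadoop jar wordcount.jar cn.bcqm1711.mr.day01.CustomWordCount /words3.txt /out0502

Note that the output directory must not already exist, and because job.setNumReduceTasks(2) is set, the result is split across two part-r-* files under the output directory.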

Word count with Scala:

package cn.qmScala.day04Scala

/**
  * Created by Administrator on 2018/6/2 0002.
  */
object Demo15WordCount {
  val acc =true

  def main(args: Array[String]) {
    val data =Array("jin tian tian qi bu cuo xiang chu qu wan ")
    //Split the input into words using flatMap
    val words:Array[String]=data.flatMap(_.split(" "))
    //Turn each word into a (word, 1) pair
    val word_one:Array[(String,Int)]=words.map((_,1))
    //Group the pairs by word
    val groupByWord:Map[String,Array[(String,Int)]]=word_one.groupBy(_._1)
    //1. Count how many times each word occurs
    val words_times:Map[String,Int]=groupByWord.mapValues(_.size)
    //for((k,v)<- words_times)println(s"$k,$v")
    //2. Sort by how often each word occurs: put the pairs into a List so they can be sorted
    val wordsTimesList:List[(String,Int)]=words_times.toList
    //Sort in descending order so the most frequent words come first
    val wordCountTimeSort:List[(String,Int)]=wordsTimesList.sortBy(-_._2)
  //  for((k,v)<- wordCountTimeSort)println(s"$k,$v")
    //3. Take the three most frequent words
    val wordCountTop3=wordCountTimeSort.take(3)
    for((k,v)<- wordCountTop3)println(s"$k,$v")
  }
}

 

posted @ 2018-06-02 14:34 瓶子xf