Word count implemented four ways: Java API, MapReduce, awk, and Scala
Word count with awk:
Method 1:

vi wordcount.awk

{
    # NF is the number of fields (columns) in the current record
    for (i = 1; i <= NF; i++)
        freq[$i]++
}
END {
    for (word in freq)
        printf "%s\t%d\n", word, freq[word]
}

Run it: awk -f wordcount.awk words.txt

Method 2:

vi wordcount_awk.sh

#!/bin/sh
awk -F " " '{
    for (i = 1; i <= NF; i++)
        freq[$i]++
} END {
    for (word in freq)
        printf "%s\t%d\n", word, freq[word]
}' $1

chmod u+x wordcount_awk.sh
./wordcount_awk.sh words.txt

NF is the number of fields (columns) in the current record.
$NF is the last field, i.e. it prints the content of the last column.

[root@localhost SHELL]# free -m | grep buffers\/
-/+ buffers/cache:       1815       1859
[root@localhost SHELL]# free -m | grep buffers\/ | awk '{print $NF}'
1859
[root@localhost SHELL]# free -m | grep buffers\/ | awk '{print NF}'
4
[root@localhost SHELL]#

printf format specifiers:
%x        hexadecimal
%o        octal
%d or %i  decimal integer
%c        single character
%s        string
%f or %e  real number, in decimal or exponential notation
%ld       long integer
%%        a literal percent sign
Word count with the Java API:
package cn.WordTongJi;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Created by Administrator on 2018/6/1 0001.
 */
public class WordDemo {
    public static void main(String[] args) throws IOException {
        // Open the input file for reading
        BufferedReader br = new BufferedReader(new FileReader("D:\\test\\aaa.txt"));
        String nextLines;
        // Define the HashMap outside the loop: <word, count>
        Map<String, Integer> map = new HashMap<String, Integer>();
        while ((nextLines = br.readLine()) != null) {
            // Split the line on spaces to get an array of words
            String[] data = nextLines.split(" ");
            // Put each word into the map, incrementing its count
            for (String word : data) {
                Integer count = map.get(word);
                map.put(word, count == null ? 1 : count + 1);
            }
        }
        br.close();
        // Iterate over the words in the map
        // keySet() returns all keys of the map as a Set
        for (String key : map.keySet()) {
            // Look up the count for each key and print it
            System.out.println(key + "----" + map.get(key));
        }
    }
}
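For comparison (not part of the original post), the same counting logic can be written more concisely with Java 8 streams. This is a minimal sketch under the same assumption of a space-separated file at the hypothetical path D:\test\aaa.txt:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class WordDemoStreams {
    public static void main(String[] args) throws IOException {
        // Read all lines, split each line on spaces, and count identical words
        try (Stream<String> lines = Files.lines(Paths.get("D:\\test\\aaa.txt"))) {
            Map<String, Long> counts = lines
                    .flatMap(line -> Arrays.stream(line.split(" ")))
                    .filter(w -> !w.isEmpty())
                    .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
            counts.forEach((word, count) -> System.out.println(word + "----" + count));
        }
    }
}

The groupingBy/counting collector plays the same role as the HashMap increment in WordDemo above.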
Word count with MapReduce:
package cn.bcqm1711.mr.day01;

/**
 * Created by Administrator on 2018/5/2.
 */

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

import java.io.IOException;

/**
 * @author :YongKe.Pan
 * @Desc : custom word count
 * @create 2018-05-02 9:44
 **/
public class CustomWordCount {

    // Map phase: by default one data block corresponds to one split, and one split to one MapTask.
    // LongWritable, Text are the types of each input line's offset and content.
    // Text, IntWritable are the key/value types of each map output record.
    public static class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Called once before the map business logic starts
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
        }

        // Business logic: this map method is called once per input line
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Get the content of the line
            String line = value.toString();
            // Split the line into words
            String[] words = line.split(" ");
            for (String wd : words) {
                word.set(wd);
                // Write <word, 1> to local disk
                context.write(word, one);
            }
        }

        // Called once after the map business logic finishes
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
        }
    }

    // Reduce phase
    // The first Text, IntWritable pair is the key/value type received from the MapTask output.
    // The second pair is the key/value type written to HDFS after the reduce processing.
    public static class WCReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        // Called once before the reduce business logic starts
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
        }

        // Keys with the same hash code are assigned to the same reducer
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            // Write the word and its aggregated count to HDFS
            context.write(key, new IntWritable(sum));
        }

        // Called once after the reduce business logic finishes
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
        }
    }

    // Job driver
    public static void main(String[] args) throws Exception {
        // Get the configuration object
        Configuration conf = new Configuration();
        // "CustomWordCount" is the job name, convenient for lookup on the history server
        // Job job = new Job();
        Job job = Job.getInstance(conf, "CustomWordCount");
        // Set the program's entry class
        job.setJarByClass(CustomWordCount.class);

        // Map phase setup
        job.setMapperClass(WCMapper.class);            // map-phase business logic
        job.setMapOutputKeyClass(Text.class);          // tell the MR framework the map output key type
        job.setMapOutputValueClass(IntWritable.class); // tell the MR framework the map output value type
        // Input path passed on the command line when submitting the job (e.g. /words3.txt)
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Reduce phase setup
        job.setReducerClass(WCReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Output directory on HDFS (e.g. /out0502)
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setPartitionerClass(HashPartitioner.class);
        job.setNumReduceTasks(2);

        // Submit the job and wait for completion
        boolean isOk = job.waitForCompletion(true);
        System.exit(isOk ? 0 : 1);
    }
}
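One common refinement, not present in the original driver, is to register a combiner so each MapTask pre-aggregates its <word, 1> pairs locally before the shuffle. Because WCReduce only sums IntWritable values, it can safely double as the combiner. A minimal sketch of the extra line, placed in main before job.waitForCompletion(true):

// Optional: pre-aggregate map output on the map side to reduce shuffle traffic.
// Reusing WCReduce is safe here because summing is associative and commutative.
job.setCombinerClass(WCReduce.class);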
Word count with Scala:
package cn.qmScala.day04Scala

/**
 * Created by Administrator on 2018/6/2 0002.
 */
object Demo15WordCount {

  def main(args: Array[String]) {
    val data = Array("jin tian tian qi bu cuo xiang chu qu wan ")

    // Split into words with flatMap
    val words: Array[String] = data.flatMap(_.split(" "))

    // Map each word to the (word, 1) form
    val word_one: Array[(String, Int)] = words.map((_, 1))

    // Group the pairs by the word itself
    val groupByWord: Map[String, Array[(String, Int)]] = word_one.groupBy(_._1)

    // 1. Count the occurrences of each word
    val words_times: Map[String, Int] = groupByWord.mapValues(_.size)
    //for ((k, v) <- words_times) println(s"$k,$v")

    // 2. Sort by word count: put the pairs into a List and use its sort method
    //    (sort descending so the most frequent words come first)
    val wordsTimesList: List[(String, Int)] = words_times.toList
    val wordCountTimeSort: List[(String, Int)] = wordsTimesList.sortBy(-_._2)
    //for ((k, v) <- wordCountTimeSort) println(s"$k,$v")

    // 3. Take the three most frequent words
    val wordCountTop3 = wordCountTimeSort.take(3)
    for ((k, v) <- wordCountTop3) println(s"$k,$v")
  }
}
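The Scala version ends by taking the three most frequent words, while the earlier Java API example stops at printing raw counts. For symmetry, here is a hedged sketch of the same top-3 step in plain Java; the map contents are hypothetical stand-ins for the counts built in WordDemo:

import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;

public class Top3Demo {
    public static void main(String[] args) {
        // Hypothetical counts standing in for the map built in WordDemo
        Map<String, Integer> map = new HashMap<>();
        map.put("tian", 2);
        map.put("jin", 1);
        map.put("qi", 1);
        map.put("bu", 1);

        // Sort entries by count, descending, and keep the first three
        map.entrySet().stream()
           .sorted(Map.Entry.<String, Integer>comparingByValue(Comparator.reverseOrder()))
           .limit(3)
           .forEach(e -> System.out.println(e.getKey() + "," + e.getValue()));
    }
}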