scala实现wordcount方法-商品标签统计-气温统计
scala实现单词统计
---------------------
import scala.io.Source

/**
  * Created by Administrator on 2018/5/7.
  *
  * Word count, version 1: groups identical words and uses the size of each
  * group as the count. Reads `d:/mr/word.txt` and prints one `(word, count)`
  * pair per distinct word.
  */
object WCApp {
  def main(args: Array[String]): Unit = {
    // 1. Load the file.
    val src = Source.fromFile("d:/mr/word.txt")
    // 2. Materialize all lines, then release the file handle.
    val lines = try src.getLines().toList finally src.close()
    // 3. Flatten the lines into words. Blank lines and repeated spaces yield
    //    empty tokens from split(" "), which must not be counted as a word.
    val words = lines.flatMap(_.split(" ")).filter(_.nonEmpty)
    // 4. Pair every word with 1.
    val map1 = words.map((w: String) => (w, 1))
    // 5. Group the pairs by word.
    val map2 = map1.groupBy(t => t._1)
    // 6. The count of a word is the size of its group.
    val map3 = map2.mapValues(list => list.size)
    // 7. Print the counts (previously the raw input lines were printed and
    //    the computed result was discarded).
    map3.foreach(println)
  }
}

scala实现单词统计2
---------------------
import scala.io.Source

/**
  * Created by Administrator on 2018/5/7.
  *
  * Word count, version 2: same pipeline as version 1, but each group is
  * collapsed with `reduce` by summing the 1s instead of taking the group size.
  */
object WCApp2 {
  def main(args: Array[String]): Unit = {
    // 1. Load the file.
    val src = Source.fromFile("d:/mr/word.txt")
    // 2. Materialize all lines, then release the file handle.
    val lines = try src.getLines().toList finally src.close()
    // 3. Flatten the lines into words, dropping empty tokens.
    val words = lines.flatMap(_.split(" ")).filter(_.nonEmpty)
    // 4. Pair every word with 1.
    val map1 = words.map((w: String) => (w, 1))
    // 5. Group by word: {hello -> [(hello,1),(hello,1),(hello,1)]}
    val map2 = map1.groupBy(t => t._1)
    // 6. Collapse each group into a single pair: {hello -> (hello,4), ...}.
    //    Groups produced by groupBy are never empty, so reduce is safe here.
    val map3 = map2.mapValues(list => list.reduce((a, b) => (a._1, a._2 + b._2)))
    // 7. Keep only the (word, count) pair of every entry. This definition was
    //    commented out while map4 was still used, which did not compile.
    val map4 = map3.map((t: Tuple2[String, Tuple2[String, Int]]) => t._2)
    map4.foreach(println)
  }
}

Bitmap实现topn统计
------------------------
import scala.io.Source

/**
  * Per-year top-N temperature query, implemented by folding each year's
  * readings into a bitmap (bit k set means temperature k was observed).
  *
  * The bitmap is 128 bytes, so only temperatures in 0..1023 are representable;
  * readings outside that range are ignored. Because a bitmap records presence
  * only, the result is the three highest *distinct* temperatures per year.
  */
object TempTopN2_Bitmap {
  def main(args: Array[String]): Unit = {
    // 1. Load the temperature file.
    val f = Source.fromFile("d:/mr/temp.dat")
    // 2. Materialize all lines, then release the file handle.
    val temps = try f.getLines().toList finally f.close()
    // 3. Extract year and temperature from each line: {(1900,28), ...}
    val map1 = temps.map((line: String) => {
      val arr = line.split(" ")
      val year = arr(0).toInt
      val temp = arr(1).toInt
      (year, temp)
    })
    // 4. Group by year: {1920 -> {(..),(..),...}, ...}
    val map2 = map1.groupBy((t: Tuple2[Int, Int]) => t._1)
    // 5. For each year, aggregate the readings into the top 3 temperatures.
    val map3 = map2.mapValues(list => {
      val bytes = list.foldLeft(new Array[Byte](128))((a, b) => {
        val temp = b._2
        // Bit 0 is a valid slot, so a reading of 0 must be recorded; the upper
        // bound keeps large readings from indexing past the end of the array.
        if (temp >= 0 && temp < a.length * 8) {
          val index = temp / 8
          val mod = temp % 8
          a(index) = (a(index) | (1 << mod)).toByte
        }
        a
      })

      // Scans the bitmap from the highest bit downwards and joins the first
      // three set positions with commas (no leading separator).
      def process(): String = {
        val setBits = for {
          x <- (0 until bytes.length).reverse.iterator
          y <- (0 to 7).reverse.iterator
          if ((bytes(x) >> y) & 1) != 0
        } yield 8 * x + y
        // The iterator is lazy, so scanning stops once three bits are found.
        setBits.take(3).mkString(",")
      }

      process()
    })
    val map4 = map3.toList.sortBy(e => e._1)
    map4.foreach(println(_))
  }
}

scala实现商品评论
---------------------
1.TagUtil.java
package com.oldboy.scala.util;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

import java.util.ArrayList;
import java.util.List;

/**
 * Tag utility class.
 */
public class TagUtil {
    /**
     * Extracts the list of comment tags from a JSON document.
     *
     * Only the first element of the "extInfoList" array is inspected; its
     * "values" array supplies the tags.
     *
     * @param json the JSON text of one record
     * @return the tags in document order; empty when any expected part is missing
     */
    public static List<String> extractTags(String json) {
        // Collected tags.
        List<String> tags = new ArrayList<String>();
        // Parse the text into a JSON object.
        JSONObject obj = JSON.parseObject(json);
        // NOTE(review): guard against a null parse result (presumably returned
        // for null/empty input - confirm against the fastjson version in use).
        if (obj == null) {
            return tags;
        }
        JSONArray array = obj.getJSONArray("extInfoList");
        // Validate the array before touching its first element.
        if (array != null && array.size() > 0) {
            JSONObject obj2 = array.getJSONObject(0);
            JSONArray arr2 = obj2 == null ? null : obj2.getJSONArray("values");
            if (arr2 != null && arr2.size() > 0) {
                for (int i = 0; i < arr2.size(); i++) {
                    tags.add(arr2.getString(i));
                }
            }
        }
        return tags;
    }
}

2.TaggenDemo
import javax.swing.text.html.HTML.Tag

import com.oldboy.scala.util.TagUtil

import scala.io.Source

/**
  * Tag statistics: for every business id, counts how often each comment tag
  * occurs, keeps the five most frequent tags per business, and prints the
  * businesses ordered by their highest tag count (descending).
  *
  * Input: `d:/mr/temptags.txt`, one record per line as `busid<TAB>json`.
  * Lines without a tab-separated second field are skipped.
  */
object TaggenDemo {
  def main(args: Array[String]): Unit = {
    // Explicit .asScala conversions replace the deprecated implicit JavaConversions.
    import scala.collection.JavaConverters._

    // 1. Load the file.
    val file = Source.fromFile("d:/mr/temptags.txt")
    // 2. Materialize all lines, then release the file handle.
    val lines = try file.getLines().toList finally file.close()
    // 3. Flatten every line into (busid, tag) pairs.
    val map1 = lines.flatMap(line => {
      val arr = line.split("\t")
      // A blank or malformed line has no JSON field; skip it instead of
      // failing with an index-out-of-bounds error.
      if (arr.length < 2) Nil
      else {
        val busid = arr(0)
        TagUtil.extractTags(arr(1)).asScala.toList.map(tag => (busid, tag))
      }
    })
    // 4. Group identical pairs: {(busid,tag) -> List((busid,tag),(busid,tag),...)}
    val map2 = map1.groupBy(t => t)
    // 5. Count each group: {(busid,tag) -> 300}
    val map3 = map2.mapValues(_.size)
    // 6. Re-shape into List((busid, (tag,cnt)), ...)
    val map4 = map3.toList.map(t => (t._1._1, (t._1._2, t._2)))
    // 7. Group again by busid: Map(busid -> List((busid, (tag,cnt)), ...))
    val map5 = map4.groupBy(t => t._1)
    // 8. Within each business, sort tags by count descending and keep the top 5:
    //    Map(busid -> List((tag,59), ...))
    val map6 = map5.mapValues(list => list.sortBy(t => -t._2._2).take(5).map(t => t._2))
    // 9. Order businesses by their largest tag count, descending. Every list is
    //    non-empty because it originates from a non-empty group.
    val map7 = map6.toList.sortBy(t => -t._2.head._2)
    map7.foreach(t => {
      val busid = t._1
      val str = t._2.mkString(";")
      println(busid + "==>" + str)
    })
  }
}