Implementing a SQL Query with Spark Core

Consider a data table pv_users with two columns, pageid and age.
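
A sample pv_users.txt consistent with the program output shown at the end (the exact rows are an assumption, inferred from that output) would contain one comma-separated pageid,age pair per line:

1,25
2,25
2,25
1,32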

We want to implement the following grouped count query on this table: select pageid,age,count(1) from pv_users group by pageid,age;
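
Run against the sample rows above, the query returns one count per (pageid, age) group, matching the Spark program's output below:

pageid | age | count(1)
1      | 25  | 1
2      | 25  | 2
1      | 32  | 1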

Rewriting the above SQL query as Spark Core code gives the following:

package com.company.sparkcore

import org.apache.spark.{SparkConf, SparkContext}

object CountPVByGroup {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName(CountPVByGroup.getClass.getSimpleName)
      .setMaster("local")
    // Optionally silence Spark/Hadoop logging (requires importing org.apache.log4j.{Level, Logger}):
//    Logger.getLogger("org.apache.spark").setLevel(Level.OFF)
//    Logger.getLogger("org.apache.hadoop").setLevel(Level.OFF)
    val sc = new SparkContext(conf)
    val lines = sc.textFile("file:///e:/pv_users.txt")
    // Concatenate pageid and age into a composite key, e.g. (1_25, 1)
    val newKeyValue = lines.map(_.split(",")).map(pvdata => (pvdata(0) + "_" + pvdata(1), 1))
    // Sum the counts for each composite key
    val pvcount = newKeyValue.reduceByKey(_ + _)
    // Split the composite key apart again, producing tuples like (1, 25, 1)
    val pvid_age_count = pvcount.map(newkv => (newkv._1.split("_")(0), newkv._1.split("_")(1), newkv._2))
    // Print the results:
    //   (1,25,1)
    //   (2,25,2)
    //   (1,32,1)
    pvid_age_count.collect().foreach(println)
    sc.stop()
  }
}
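
The string concatenation and re-splitting exist only to build a single grouping key, but a tuple already works as a reduceByKey key, so the "pageid_age" round trip can be dropped. Below is a minimal alternative sketch, not from the original post: the object name CountPVByGroupTuple is made up, and it assumes the same comma-separated input file as above.

package com.company.sparkcore

import org.apache.spark.{SparkConf, SparkContext}

// Alternative sketch: group on a (pageid, age) tuple key instead of a
// concatenated "pageid_age" string.
object CountPVByGroupTuple {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("CountPVByGroupTuple")
      .setMaster("local")
    val sc = new SparkContext(conf)
    val counts = sc.textFile("file:///e:/pv_users.txt") // assumed input path, as above
      .map(_.split(","))
      .map(fields => ((fields(0), fields(1)), 1)) // key: (pageid, age), value: 1
      .reduceByKey(_ + _)                         // sum counts per (pageid, age) group
      .map { case ((pageid, age), count) => (pageid, age, count) }
    counts.collect().foreach(println)
    sc.stop()
  }
}

Keying on the tuple also sidesteps any ambiguity if a pageid or age value were ever to contain the underscore separator.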

