spark算子练习

  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().master("local[*]").appName("exam2").getOrCreate()
    import session.implicits._
    val df = session.read.option("header", true).csv("hdfs://192.168.131.200:9000/app/data/exam/returned_goods_log_7-9.csv")
    //    df.createOrReplaceTempView("sales")

    //    session.sql("""select distinct 订单号 from sales""").show()

    /* todo: 1统计产生售后服务的订单号的个数 */

    //    df.dropDuplicates("订单号").agg(count("订单号")).show()  // 3362

    //    df.agg(countDistinct("订单号")).show()  // 3362

    /* todo: 2统计每个月分别产生售后服务的不同订单个数 */

    //    df.groupBy("月份").agg(countDistinct("订单号")).show()

    /* todo: 3统计产生多次售后服务的订单及对应的售后服务次数 */

    //    df.groupBy("订单号").agg(count("订单号").as("cnt"))
    //      .where("cnt>1").select("订单号", "cnt").show()

    /* todo: 4统计每个月每种售后服务问题类型占当月总售后服务量的占比(保留小数点后 3 位)，
         输出结果模式（月份，问题类型，占比）*/

    //        val temp_cnt = df.groupBy("月份").agg(count("订单号").as("cnt"))
    //          .select("月份", "cnt")
    //
    //        val temp = df.groupBy("月份", "关联(问题类型)").agg(count("订单号").as("month_cnt"))
    //          .select("月份", "关联(问题类型)","month_cnt")
    //
    //
    //        temp.join(temp_cnt,Seq("月份"),"inner")
    //            .select($"月份",$"关联(问题类型)",round($"month_cnt"/$"cnt",3).as("占比"))
    //            .show()

    /* todo: 5统计每个月每个员工处理的任务状态为已完成状态的售后服务数量 */

//    df.where("任务状态='已完成'").groupBy($"月份", $"任务执行人").agg(countDistinct("订单号").as("cnt_task"))
//      .where("执行人!=''").select($"月份", $"执行人", $"cnt_task").show()


    session.stop()

  }

def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("userbehavior").master("local[*]").getOrCreate()
    import spark.implicits._
    var df = spark.read.csv("hdfs://192.168.131.200:9000/data/userbehavior/UserBehavior.csv")

    df = df.withColumnRenamed("_c0", "user_id")
      .withColumnRenamed("_c1", "item_id")
      .withColumnRenamed("_c2", "category_id")
      .withColumnRenamed("_c3", "behavior_type")
      .withColumnRenamed("_c4", "time")

    /* todo: 统计 uv 值（一共有多少用户访问淘宝）*/

    //    df.groupBy("user_id").count().agg(count("count")).show()   // 5458


    /* todo: 分别统计浏览行为为点击，收藏，加入购物车，购买的总数量 */

    //    val pv_cnt = df.where("behavior_type='pv'").groupBy("behavior_type")
    //      .agg(count("behavior_type"))   //  503881

    //    val buy_cnt = df.where("behavior_type='buy'").groupBy("behavior_type")
    //      .agg(count("behavior_type"))   //  11508


    //    val cart_cnt = df.where("behavior_type='cart'").groupBy("behavior_type")
    //      .agg(count("behavior_type"))   //  30888

    //    val fav_cnt = df.where("behavior_type='fav'").groupBy("behavior_type")
    //      .agg(count("behavior_type"))   //   15017

    //    df.groupBy("behavior_type").agg(count("behavior_type")).show()


    val df1 = df.where("behavior_type='buy'")
      .withColumn("buytime", from_unixtime($"time", "yyyy-MM-dd"))
      .where("buytime>='2017-11-03' and buytime<='2017-12-03'")
      .cache()

    //    df1.withColumn("daycha",
    //      when(datediff(lit("2017-12-03"), $"buytime") <= lit(6), 4)
    //        .when(datediff(lit("2017-12-03"), $"buytime") <= lit(12), 3)
    //        .when(datediff(lit("2017-12-03"), $"buytime") <= lit(18), 2)
    //        .when(datediff(lit("2017-12-03"), $"buytime") <= lit(24), 1)
    //        .otherwise(0))
    //      .show()

    df1.groupBy("user_id").agg(count("item_id").as("pay_cnt"))
      .withColumn("level",
        when($"pay_cnt" <= lit(32), 0)
          .when($"pay_cnt" <= lit(64), 1)
          .when($"pay_cnt" <= lit(96), 2)
          .when($"pay_cnt" <= lit(128), 3)
          .otherwise(4)
      ).orderBy(desc("level")).show()

  }

posted on 2021-12-29 13:40 理想三旬_z 阅读(117) 评论(0) 编辑收藏举报

刷新页面返回顶部

spark算子练习

导航

公告