根据用户的注册信息表及用户每日登录平台的数据信息来分析用户7日留存情况

【1】数据 文件 registuser.csv 数据如下

uid,regist_day,regist_os
u1,20200401,android
u2,20200401,iphone
u3,20200401,android
u4,20200402,iphone
u5,20200402,android
u6,20200403,iphone
u7,20200403,iphone
u8,20200404,android
u9,20200404,android

文件 loginInfos.csv 数据如下

uid,login_date
u1,20200401
u1,20200401
u2,20200401
u2,20200401
u3,20200401
u1,20200402
u2,20200402
u3,20200402
u4,20200402
u5,20200402
u1,20200403
u2,20200403
u3,20200403
u4,20200403
u5,20200403
u6,20200403
u6,20200403
u7,20200403
u1,20200404
u2,20200404
u3,20200404
u4,20200404
u5,20200404
u6,20200404
u7,20200404
u8,20200404
u9,20200404
u1,20200405
u2,20200405
u3,20200405
u4,20200406
u5,20200406
u6,20200406
u7,20200407
u8,20200407
u9,20200407
u1,20200408
u3,20200408
u3,20200408

【2】scala代码实现

  1 package com.it.baizhan.scalacode.sparksql.examples
  2 
  3 import org.apache.spark.sql.SparkSession
  4 
  5 /**
  6   * 根据用户的注册信息表及用户每日登录平台的数据信息来分析用户7日留存情况。
  7   *  SQL函数:
  8   *   datediff(日期1,日期2) : 计算日期1与日期2的相差天数,日期格式必须是 “yyyy-MM-dd”格式。
  9   *   unix_timestamp(日期,"yyyyMMdd") : 按照指定的格式将日期数据转换成时间戳。
 10   *   from_unixtime(timestamp,"yyyy-MM-dd") : 将时间戳转换成指定日期格式。
 11   *
 12   */
 13 object UserKeep {
 14   def main(args: Array[String]): Unit = {
 15     val session = SparkSession.builder().master("local").appName("test").getOrCreate()
 16     session.sparkContext.setLogLevel("Error")
 17     val registerInfos = session.read.option("header",true).csv("./data/registuser.csv")
 18     val loginInfos = session.read.option("header",true).csv("./data/loginInfos.csv")
 19 
 20     registerInfos.createTempView("regist")
 21     loginInfos.createTempView("login")
 22 
 23     //对login 表中的数据 相同用户相同日期登录的数据进行去重
 24     session.sql(
 25       """
 26         |select distinct uid,login_date from login
 27       """.stripMargin).createOrReplaceTempView("login")
 28 
 29     /**
 30      * +---+----------+
 31      * |uid|login_date|
 32      * +---+----------+
 33      * | u7|  20200403|
 34      * | u7|  20200407|
 35      * | u4|  20200403|
 36      * | u8|  20200404|
 37      * | u6|  20200404|
 38      * | u1|  20200405|
 39      * | u1|  20200403|
 40      * | u5|  20200402|
 41      * | u4|  20200406|
 42      * | u2|  20200405|
 43      * | u1|  20200404|
 44      * | u3|  20200401|
 45      * | u1|  20200401|
 46      * | u2|  20200404|
 47      * | u2|  20200402|
 48      * | u4|  20200402|
 49      * | u5|  20200404|
 50      * | u3|  20200404|
 51      * | u9|  20200404|
 52      * | u2|  20200403|
 53      * | u3|  20200405|
 54      * | u1|  20200402|
 55      * | u4|  20200404|
 56      * | u6|  20200406|
 57      * | u2|  20200401|
 58      * | u3|  20200402|
 59      * | u3|  20200403|
 60      * | u3|  20200408|
 61      * | u5|  20200406|
 62      * | u6|  20200403|
 63      * | u5|  20200403|
 64      * | u9|  20200407|
 65      * | u8|  20200407|
 66      * | u1|  20200408|
 67      * | u7|  20200404|
 68      * +---+----------+
 69      */
 70 
 71     //用户注册信息表与用户登录信息表进行关联,找出每个用户登录日期与注册日期的差值
 72     session.sql(
 73       """
 74         | select
 75         |   b.uid,b.regist_day,a.login_date,
 76         |   datediff(from_unixtime(unix_timestamp(a.login_date,'yyyyMMdd'),'yyyy-MM-dd'),
 77         |     from_unixtime(unix_timestamp(b.regist_day,'yyyyMMdd'),'yyyy-MM-dd')) as diff
 78         | from login a join regist b
 79         | on a.uid = b.uid
 80       """.stripMargin).createTempView("temp")
 81 
 82     /**
 83      * +---+----------+----------+----+
 84      * |uid|regist_day|login_date|diff|
 85      * +---+----------+----------+----+
 86      * | u7|  20200403|  20200403|   0|
 87      * | u7|  20200403|  20200407|   4|
 88      * | u4|  20200402|  20200403|   1|
 89      * | u8|  20200404|  20200404|   0|
 90      * | u6|  20200403|  20200404|   1|
 91      * | u1|  20200401|  20200405|   4|
 92      * | u1|  20200401|  20200403|   2|
 93      * | u5|  20200402|  20200402|   0|
 94      * | u4|  20200402|  20200406|   4|
 95      * | u2|  20200401|  20200405|   4|
 96      * | u1|  20200401|  20200404|   3|
 97      * | u3|  20200401|  20200401|   0|
 98      * | u1|  20200401|  20200401|   0|
 99      * | u2|  20200401|  20200404|   3|
100      * | u2|  20200401|  20200402|   1|
101      * | u4|  20200402|  20200402|   0|
102      * | u5|  20200402|  20200404|   2|
103      * | u3|  20200401|  20200404|   3|
104      * | u9|  20200404|  20200404|   0|
105      * | u2|  20200401|  20200403|   2|
106      * | u3|  20200401|  20200405|   4|
107      * | u1|  20200401|  20200402|   1|
108      * | u4|  20200402|  20200404|   2|
109      * | u6|  20200403|  20200406|   3|
110      * | u2|  20200401|  20200401|   0|
111      * | u3|  20200401|  20200402|   1|
112      * | u3|  20200401|  20200403|   2|
113      * | u3|  20200401|  20200408|   7|
114      * | u5|  20200402|  20200406|   4|
115      * | u6|  20200403|  20200403|   0|
116      * | u5|  20200402|  20200403|   1|
117      * | u9|  20200404|  20200407|   3|
118      * | u8|  20200404|  20200407|   3|
119      * | u1|  20200401|  20200408|   7|
120      * | u7|  20200403|  20200404|   1|
121      * +---+----------+----------+----+
122      */
123 
124     //统计注册日期的 7 日留存情况
125     session.sql(
126       """
127         | select
128         |   regist_day ,diff ,count(*) as usercount
129         | from temp
130         | where diff != 0
131         | group by regist_day ,diff
132         | order by regist_day,diff
133       """.stripMargin).show()
134 
135     /**
136      * +----------+----+---------+
137      * |regist_day|diff|usercount|
138      * +----------+----+---------+
139      * |  20200401|   1|        3|
140      * |  20200401|   2|        3|
141      * |  20200401|   3|        3|
142      * |  20200401|   4|        3|
143      * |  20200401|   7|        2|
144      * |  20200402|   1|        2|
145      * |  20200402|   2|        2|
146      * |  20200402|   4|        2|
147      * |  20200403|   1|        2|
148      * |  20200403|   3|        1|
149      * |  20200403|   4|        1|
150      * |  20200404|   3|        2|
151      * +----------+----+---------+
152      */
153   }
154 
155 }
posted @ 2021-03-04 16:54  大数据程序员  阅读(271)  评论(0编辑  收藏  举报