……

第11章 需求一:用户活跃主题

11.1 DWS层

目标:统计当日(dau)、当周、当月活动的每个设备明细

11.1.1 每日活跃设备明细

img

1)建表语句

drop table if exists dws_uv_detail_day;
create external table dws_uv_detail_day
(
  `mid_id` string COMMENT '设备唯一标识',
  `user_id` string COMMENT '用户标识',
  `version_code` string COMMENT '程序版本号',
  `version_name` string COMMENT '程序版本名',
  `lang` string COMMENT '系统语言',
  `source` string COMMENT '渠道号',
  `os` string COMMENT '安卓系统版本',
  `area` string COMMENT '区域',
  `model` string COMMENT '手机型号',
  `brand` string COMMENT '手机品牌',
  `sdk_version` string COMMENT 'sdkVersion',
  `gmail` string COMMENT 'gmail',
  `height_width` string COMMENT '屏幕宽高',
  `app_time` string COMMENT '客户端日志产生时的时间',
  `network` string COMMENT '网络模式',
  `lng` string COMMENT '经度',
  `lat` string COMMENT '纬度'
)
partitioned by(dt string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_day';

2)数据导入

以用户单日访问为key进行聚合,如果某个用户在一天中使用了两种操作系统、两个系统版本、多个地区,登录不同账号,只取其中之一

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table dws_uv_detail_day
partition(dt='2020-10-14')
select  
  mid_id,
  concat_ws('|', collect_set(user_id)) user_id,
  concat_ws('|', collect_set(version_code)) version_code,
  concat_ws('|', collect_set(version_name)) version_name,
  concat_ws('|', collect_set(lang))lang,
  concat_ws('|', collect_set(source)) source,
  concat_ws('|', collect_set(os)) os,
  concat_ws('|', collect_set(area)) area,
  concat_ws('|', collect_set(model)) model,
  concat_ws('|', collect_set(brand)) brand,
  concat_ws('|', collect_set(sdk_version)) sdk_version,
  concat_ws('|', collect_set(gmail)) gmail,
  concat_ws('|', collect_set(height_width)) height_width,
  concat_ws('|', collect_set(app_time)) app_time,
  concat_ws('|', collect_set(network)) network,
  concat_ws('|', collect_set(lng)) lng,
  concat_ws('|', collect_set(lat)) lat
from dwd_start_log
where dt='2020-10-14'
group by mid_id;

3)查询导入结果

select * from dws_uv_detail_day limit 1;
select count(*) from dws_uv_detail_day;

4)思考

不同渠道来源的每日活跃数统计怎么计算?

11.1.2 每周活跃设备明细

img

根据日用户访问明细,获得周用户访问明细。

1)建表语句

drop table if exists dws_uv_detail_wk;
create external table dws_uv_detail_wk(
  `mid_id` string COMMENT '设备唯一标识',
  `user_id` string COMMENT '用户标识',
  `version_code` string COMMENT '程序版本号',
  `version_name` string COMMENT '程序版本名',
  `lang` string COMMENT '系统语言',
  `source` string COMMENT '渠道号',
  `os` string COMMENT '安卓系统版本',
  `area` string COMMENT '区域',
  `model` string COMMENT '手机型号',
  `brand` string COMMENT '手机品牌',
  `sdk_version` string COMMENT 'sdkVersion',
  `gmail` string COMMENT 'gmail',
  `height_width` string COMMENT '屏幕宽高',
  `app_time` string COMMENT '客户端日志产生时的时间',
  `network` string COMMENT '网络模式',
  `lng` string COMMENT '经度',
  `lat` string COMMENT '纬度',
  `monday_date` string COMMENT '周一日期',
  `sunday_date` string COMMENT  '周日日期'
) COMMENT '活跃用户按周明细'
PARTITIONED BY (`wk_dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_wk/';

2)数据导入

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table dws_uv_detail_wk partition(wk_dt)
select  
  mid_id,
  concat_ws('|', collect_set(user_id)) user_id,
  concat_ws('|', collect_set(version_code)) version_code,
  concat_ws('|', collect_set(version_name)) version_name,
  concat_ws('|', collect_set(lang)) lang,
  concat_ws('|', collect_set(source)) source,
  concat_ws('|', collect_set(os)) os,
  concat_ws('|', collect_set(area)) area,
  concat_ws('|', collect_set(model)) model,
  concat_ws('|', collect_set(brand)) brand,
  concat_ws('|', collect_set(sdk_version)) sdk_version,
  concat_ws('|', collect_set(gmail)) gmail,
  concat_ws('|', collect_set(height_width)) height_width,
  concat_ws('|', collect_set(app_time)) app_time,
  concat_ws('|', collect_set(network)) network,
  concat_ws('|', collect_set(lng)) lng,
  concat_ws('|', collect_set(lat)) lat,
  date_add(next_day('2020-10-14','MO'),-7),
  date_add(next_day('2020-10-14','MO'),-1),
  concat(date_add( next_day('2020-10-14','MO'),-7), '_' , date_add(next_day('2020-10-14','MO'),-1)
)
from dws_uv_detail_day
where dt>=date_add(next_day('2020-10-14','MO'),-7) and dt<=date_add(next_day('2020-10-14','MO'),-1)
group by mid_id;

3)查询导入结果

select * from dws_uv_detail_wk limit 1;
select count(*) from dws_uv_detail_wk;

11.1.3 每月活跃设备明细

img

1)建表语句

drop table if exists dws_uv_detail_mn;

create external table dws_uv_detail_mn(
  `mid_id` string COMMENT '设备唯一标识',
  `user_id` string COMMENT '用户标识',
  `version_code` string COMMENT '程序版本号',
  `version_name` string COMMENT '程序版本名',
  `lang` string COMMENT '系统语言',
  `source` string COMMENT '渠道号',
  `os` string COMMENT '安卓系统版本',
  `area` string COMMENT '区域',
  `model` string COMMENT '手机型号',
  `brand` string COMMENT '手机品牌',
  `sdk_version` string COMMENT 'sdkVersion',
  `gmail` string COMMENT 'gmail',
  `height_width` string COMMENT '屏幕宽高',
  `app_time` string COMMENT '客户端日志产生时的时间',
  `network` string COMMENT '网络模式',
  `lng` string COMMENT '经度',
  `lat` string COMMENT '纬度'
) COMMENT '活跃用户按月明细'
PARTITIONED BY (`mn` string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_mn/';

2)数据导入

set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table dws_uv_detail_mn partition(mn)
select  
  mid_id,
  concat_ws('|', collect_set(user_id)) user_id,
  concat_ws('|', collect_set(version_code)) version_code,
  concat_ws('|', collect_set(version_name)) version_name,
  concat_ws('|', collect_set(lang)) lang,
  concat_ws('|', collect_set(source)) source,
  concat_ws('|', collect_set(os)) os,
  concat_ws('|', collect_set(area)) area,
  concat_ws('|', collect_set(model)) model,
  concat_ws('|', collect_set(brand)) brand,
  concat_ws('|', collect_set(sdk_version)) sdk_version,
  concat_ws('|', collect_set(gmail)) gmail,
  concat_ws('|', collect_set(height_width)) height_width,
  concat_ws('|', collect_set(app_time)) app_time,
  concat_ws('|', collect_set(network)) network,
  concat_ws('|', collect_set(lng)) lng,
  concat_ws('|', collect_set(lat)) lat,
  date_format('2020-10-14','yyyy-MM')
from dws_uv_detail_day
where date_format(dt,'yyyy-MM') = date_format('2020-10-14','yyyy-MM')
group by mid_id;

3)查询导入结果

select * from dws_uv_detail_mn limit 1;
select count(*) from dws_uv_detail_mn ;

11.1.4 DWS层加载数据脚本

1)创建脚本

[kgg@hadoop102 bin]$ vim dws_uv_log.sh
  在脚本中编写如下内容
#!/bin/bash

# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
   do_date=$1
else
   do_date=`date -d "-1 day" +%F`  
fi
echo "===日志日期为 $do_date==="
sql="
 set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table "$APP".dws_uv_detail_day partition(dt='$do_date')
select  
  mid_id,
  concat_ws('|', collect_set(user_id)) user_id,
  concat_ws('|', collect_set(version_code)) version_code,
  concat_ws('|', collect_set(version_name)) version_name,
  concat_ws('|', collect_set(lang)) lang,
  concat_ws('|', collect_set(source)) source,
  concat_ws('|', collect_set(os)) os,
  concat_ws('|', collect_set(area)) area,
  concat_ws('|', collect_set(model)) model,
  concat_ws('|', collect_set(brand)) brand,
  concat_ws('|', collect_set(sdk_version)) sdk_version,
  concat_ws('|', collect_set(gmail)) gmail,
  concat_ws('|', collect_set(height_width)) height_width,
  concat_ws('|', collect_set(app_time)) app_time,
  concat_ws('|', collect_set(network)) network,
  concat_ws('|', collect_set(lng)) lng,
  concat_ws('|', collect_set(lat)) lat
from "$APP".dwd_start_log
where dt='$do_date'  
group by mid_id;


insert overwrite table "$APP".dws_uv_detail_wk partition(wk_dt)
select  
  mid_id,
  concat_ws('|', collect_set(user_id)) user_id,
  concat_ws('|', collect_set(version_code)) version_code,
  concat_ws('|', collect_set(version_name)) version_name,
  concat_ws('|', collect_set(lang)) lang,
  concat_ws('|', collect_set(source)) source,
  concat_ws('|', collect_set(os)) os,
  concat_ws('|', collect_set(area)) area,
  concat_ws('|', collect_set(model)) model,
  concat_ws('|', collect_set(brand)) brand,
  concat_ws('|', collect_set(sdk_version)) sdk_version,
  concat_ws('|', collect_set(gmail)) gmail,
  concat_ws('|', collect_set(height_width)) height_width,
  concat_ws('|', collect_set(app_time)) app_time,
  concat_ws('|', collect_set(network)) network,
  concat_ws('|', collect_set(lng)) lng,
  concat_ws('|', collect_set(lat)) lat,
  date_add(next_day('$do_date','MO'),-7),
  date_add(next_day('$do_date','MO'),-1),
  concat(date_add( next_day('$do_date','MO'),-7), '_' , date_add(next_day('$do_date','MO'),-1)
)
from "$APP".dws_uv_detail_day
where dt>=date_add(next_day('$do_date','MO'),-7) and dt<=date_add(next_day('$do_date','MO'),-1)
group by mid_id;


insert overwrite table "$APP".dws_uv_detail_mn partition(mn)
select
  mid_id,
  concat_ws('|', collect_set(user_id)) user_id,
  concat_ws('|', collect_set(version_code)) version_code,
  concat_ws('|', collect_set(version_name)) version_name,
  concat_ws('|', collect_set(lang))lang,
  concat_ws('|', collect_set(source)) source,
  concat_ws('|', collect_set(os)) os,
  concat_ws('|', collect_set(area)) area,
  concat_ws('|', collect_set(model)) model,
  concat_ws('|', collect_set(brand)) brand,
  concat_ws('|', collect_set(sdk_version)) sdk_version,
  concat_ws('|', collect_set(gmail)) gmail,
  concat_ws('|', collect_set(height_width)) height_width,
  concat_ws('|', collect_set(app_time)) app_time,
  concat_ws('|', collect_set(network)) network,
  concat_ws('|', collect_set(lng)) lng,
  concat_ws('|', collect_set(lat)) lat,
  date_format('$do_date','yyyy-MM')
from "$APP".dws_uv_detail_day
where date_format(dt,'yyyy-MM') = date_format('$do_date','yyyy-MM')  
group by mid_id;
"

$hive -e "$sql"

2)增加脚本执行权限

chmod 777 dws_uv_log.sh

3)脚本使用

dws_uv_log.sh 2019-02-11

4)查询结果

select count(*) from dws_uv_detail_day where dt='2019-02-11';
select count(*) from dws_uv_detail_wk;
select count(*) from dws_uv_detail_mn ;

5)脚本执行时间

企业开发中一般在每日凌晨30分~1点

 

11.2 ADS层

目标:当日、当周、当月活跃设备数

11.2.1 活跃设备数

img

1)建表语句

drop table if exists ads_uv_count;
create external table ads_uv_count(
  `dt` string COMMENT '统计日期',
  `day_count` bigint COMMENT '当日用户数量',
  `wk_count`  bigint COMMENT '当周用户数量',
  `mn_count`  bigint COMMENT '当月用户数量',
  `is_weekend` string COMMENT 'Y,N是否是周末,用于得到本周最终结果',
  `is_monthend` string COMMENT 'Y,N是否是月末,用于得到本月最终结果'
) COMMENT '活跃设备数'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_uv_count/';

2)导入数据

insert into table ads_uv_count 
select  
 '2020-10-14' dt,
  daycount.ct,
  wkcount.ct,
  mncount.ct,
if(date_add(next_day('2020-10-14','MO'),-1)='2020-10-14','Y','N') ,
  if(last_day('2020-10-14')='2020-10-14','Y','N')
from
(
  select  
     '2020-10-14' dt,
      count(*) ct
  from dws_uv_detail_day
  where dt='2020-10-14'  
)daycount join
(
  select  
    '2020-10-14' dt,
    count (*) ct
  from dws_uv_detail_wk
  where wk_dt=concat(date_add(next_day('2020-10-14','MO'),-7),'_' ,date_add(next_day('2020-10-14','MO'),-1) )
) wkcount on daycount.dt=wkcount.dt
join
(
  select  
    '2020-10-14' dt,
    count (*) ct
  from dws_uv_detail_mn
  where mn=date_format('2020-10-14','yyyy-MM')  
)mncount on daycount.dt=mncount.dt;

3)查询导入结果

select * from ads_uv_count ;

11.2.2 ADS层加载数据脚本

1)创建脚本

[kgg@hadoop102 bin]$ vim ads_uv_log.sh
  在脚本中编写如下内容
#!/bin/bash

# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
   do_date=$1
else
   do_date=`date -d "-1 day" +%F`  
fi
echo "===日志日期为 $do_date==="
sql="
 set hive.exec.dynamic.partition.mode=nonstrict;

insert into table "$APP".ads_uv_count
select  
 '$do_date' dt,
  daycount.ct,
  wkcount.ct,
  mncount.ct,
  if(date_add(next_day('$do_date','MO'),-1)='$do_date','Y','N') ,
  if(last_day('$do_date')='$do_date','Y','N')
from
(
  select  
     '$do_date' dt,
      count(*) ct
  from "$APP".dws_uv_detail_day
  where dt='$do_date'  
)daycount   join
(
  select  
    '$do_date' dt,
    count (*) ct
  from "$APP".dws_uv_detail_wk
  where wk_dt=concat(date_add(next_day('$do_date','MO'),-7),'_' ,date_add(next_day('$do_date','MO'),-1) )
) wkcount on daycount.dt=wkcount.dt
join
(
  select  
    '$do_date' dt,
    count (*) ct
  from "$APP".dws_uv_detail_mn
  where mn=date_format('$do_date','yyyy-MM')  
)mncount on daycount.dt=mncount.dt;
"

$hive -e "$sql"

2)增加脚本执行权限

chmod 777 ads_uv_log.sh

3)脚本使用

 ads_uv_log.sh 2019-02-11

4)脚本执行时间

企业开发中一般在每日凌晨30分~1点

5)查询导入结果

 select * from ads_uv_count;

 

 posted on 2021-01-20 16:13  大码王  阅读(324)  评论(0编辑  收藏  举报
复制代码