hive数据仓库建设

hive数据仓库建设

1、设计原生日志表

原生日志表用来存放上报的原始日志,数据经过清洗加工后会进入到各个日志表中。

1.1 创建数据库

#创建数据库
$hive>create database umeng_big11 ;

1.2 创建原生日志表

原生表使用分区表设计,分区字段为ym/d/hm,hive使用动态分区表,分区采用非严格模式,即所有分区都可以是动态分区。hive命令行终端打开显式表头设置:

#临时设置,只在当前回话有效
$hive>set hive.cli.print.header=true ;

永久配置hive-site.xml:

...
<property>
  <name>hive.cli.print.header</name>
  <value>true</value>
</property>
...

创建hive原生日志表raw_logs:

$hive>create table raw_logs
(
  servertimems	float ,
  servertimestr string , 
  clientip		string ,
  clienttimems 	bigint,
  status 		int ,
  log 			string 
)
PARTITIONED BY (ym int, day int , hm int)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '#'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

1.3 加载hdfs数据到hive原生表

$hive>use umeng_big11 ;
$hive>load data inpath '/user/centos/umeng/raw-logs/201805/24/1809' into table raw_logs partition(ym=201805 , day = 24 , hm = 1809) ;

2、自定义UDF函数完成数据清洗

2.1 介绍

将原生数据加载原生日志表后,将status码为200的记录查询出来进行清洗,将结果分别插入到5类日志子表中。供以后分析使用。

3、创建日志子表

3.1 准备sql语句

创建/home/centos/umeng/umeng_create_logs_ddl.sql文件,内容如下:

--使用指定库
use umeng_big11 ;

--startuplogs
create table if not exists startuplogs
(
  appChannel 	string ,
  appId			string ,
  appPlatform	string ,
  appVersion	string ,
  brand			string ,
  carrier 		string ,
  country 		string ,
  createdAtMs 	bigint ,
  deviceId 		string ,
  deviceStyle 	string ,
  ipAddress 	string ,
  network		string ,
  osType		string ,
  province		string ,
  screenSize	string ,
  tenantId		string
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

--eventlogs
create table if not exists eventlogs
(
  appChannel		string ,
  appId 			string ,
  appPlatform		string ,
  appVersion		string ,
  createdAtMs		bigint ,
  deviceId			string ,
  deviceStyle		string ,
  eventDurationSecs bigint ,
  eventId 			string ,
  osType 			string ,
  tenantId 			string
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

--errorlogs
create table if not exists errorlogs
(
  appChannel 	string ,
  appId 		string ,
  appPlatform 	string ,
  appVersion 	string ,
  createdAtMs 	bigint ,
  deviceId 		string ,
  deviceStyle 	string ,
  errorBrief 	string ,
  errorDetail 	string ,
  osType 		string ,
  tenantId 		string
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

--usagelogs
create table if not exists usgaelogs
(
  appChannel 			string ,
  appId 				string ,
  appPlatform 			string ,
  appVersion 			string ,
  createdAtMs 			bigint ,
  deviceId 				string ,
  deviceStyle 			string ,
  osType 				string ,
  singleDownloadTraffic bigint ,
  singleUploadTraffic	bigint ,
  singleUseDurationSecs	bigint ,
  tenantId 				string
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

--pagelogs
create table if not exists pagelogs
(
  appChannel 			string ,
  appId 				string ,
  appPlatform 			string ,
  appVersion 			string ,
  createdAtMs 			bigint ,
  deviceId 				string ,
  deviceStyle 			string ,
  nextPage 				string ,
  osType 				string ,
  pageId 				string ,
  pageViewCntInSession	int ,
  stayDurationSecs		bigint ,
  tenantId 				string ,
  visitIndex 			int
) 
partitioned by (ym int ,day int , hm int) 
stored as parquet ; 

3.2 执行sql脚本

$hive>source /home/centos/umeng/umeng_create_logs_ddl.sql
posted @ 2018-08-28 00:59  大道至简(老徐)  阅读(567)  评论(0编辑  收藏  举报