3.动态分区的解决方案(将配置文件改成清洗昨天的数据)_将数据导入hive中_项目一

动态分区问题的解决

在dataClean.sh中清洗数据前,创建一个动态时间变量
timeStr=`date -d "yesterday" "+%Y%m%d"`

在dataAnaly.sh中执行
yesterday=`date -d "yesterday" "+%Y%m%d"`
hive --hiveconf yesterday=${yesterday} -f /opt/project/dataClean/Script-1.sql

数据清洗文件

[root@node1 dataClean]# pwd
/opt/project/dataClean
[root@node1 dataClean]# cat dataClean.sh 
#!/bin/bash
# Data-cleaning driver: runs the MapReduce cleaning job over yesterday's raw logs.
# Note: no spaces around '=' in shell assignments; command output captured via $(...).
echo "=============================项目数据清洗数据启动成功=============================="
# Yesterday's date as YYYYMMDD — must match the date layout of the HDFS input
# directories AND the "+%Y%m%d" format used for the Hive partition value.
timeStr=$(date -d "yesterday" "+%Y%m%d")
# Build the input glob, e.g. /project/20220809/* (interpreted by Hadoop on HDFS).
inpath="/project/${timeStr}/*"
echo "MR程序的清洗路径定义完成,清洗数据路径为$inpath"
outpath="/dataClean"
# Quote $inpath so the local shell does not expand '*' against the local filesystem;
# Hadoop must receive the literal pattern and resolve it on HDFS.
hadoop jar /opt/project/dataClean/dataClean.jar DataCleanDriver "$inpath" "$outpath"

sql文件

[root@node1 dataClean]# pwd
/opt/project/dataClean
[root@node1 dataClean]# cat Script-1.sql 
-- Create the project database on first run, then make it the current schema.
create database if not exists project;
use project;

-- External table over the MR-cleaned web logs: the data files live outside
-- Hive's managed warehouse, so dropping the table leaves the files intact.
create external table if not exists web_origin(
	ipaddr string comment "ip address",
	visit_time string comment "日志的产生时间",
	request_url string comment "请求的网址",
	status int comment "网站的响应状态码",
	body_bytes int comment "响应字节数",
	referer_url string comment "请求网址的来源网站",
	user_agent string comment "用户的浏览信息",
	province string comment "用户访问时所处的省份",
	latitude string comment "纬度",
	longitude string comment "经度",
	age int comment "年龄"
)
-- One partition per log date so each daily load is isolated and prunable.
partitioned by(logdate string)
-- Input files are plain comma-separated text, one record per line.
row format delimited
fields terminated by ',';

-- Print full table metadata (location, partition keys, SerDe) for verification.
desc formatted web_origin;

-- Move the MR output files from /dataClean into yesterday's partition.
-- ${hiveconf:yesterday} is injected by `hive --hiveconf yesterday=...` in
-- dataAnaly.sh; NOTE(review): presumably formatted YYYYMMDD to match the
-- cleaning script's date layout — confirm the caller uses "+%Y%m%d".
load data inpath "/dataClean/part-m-00000" into table web_origin partition(logdate="${hiveconf:yesterday}");
load data inpath "/dataClean/part-m-00001" into table web_origin partition(logdate="${hiveconf:yesterday}");
-- Smoke test: show one row to confirm the load succeeded.
select * from web_origin limit 1;

执行脚本,将数据导入sql文件中

[root@node1 dataClean]# pwd
/opt/project/dataClean
[root@node1 dataClean]# cat dataAnaly.sh 
#!/bin/bash
# Loads yesterday's cleaned data into the Hive partition for that date.
# FIX: the format was "+%Y%d%m" (year-DAY-month), which produced partition
# values like 20220908 for Aug 9 — inconsistent with dataClean.sh's
# "+%Y%m%d". Use the same YYYYMMDD layout so partitions line up.
yesterday=$(date -d "yesterday" "+%Y%m%d")
# Pass the date into the SQL script, where it is read as ${hiveconf:yesterday}.
hive --hiveconf yesterday="${yesterday}" -f /opt/project/dataClean/Script-1.sql
posted @ 2022-08-09 16:42  jsqup  阅读(46)  评论(0编辑  收藏  举报