Study notes: shell script for running data jobs on the big data platform
#!/bin/sh
echo "清空同名文件"
rm -rf bdp_venv.zip
echo "获取环境、脚本、数据等文件"
hdfs dfs -get "/user/0/upload/env.zip"
hdfs dfs -get "/user/0/upload/a.csv"
hdfs dfs -get "/user/0/upload/speed.py"
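# Added sketch (not in the original script): fail fast if any fetch failed,
# rather than hitting confusing errors further down.
for f in env.zip a.csv speed.py; do
    [ -f "$f" ] || { echo "missing $f, aborting"; exit 1; }
done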
echo "开始解压,不打印日志"
unzip env.zip > /dev/null
echo "当前路径"
cur_dir=`pwd`
echo "hive取数文件存放位置"
t_path="env/data/t.csv"
s_path="env/data/s.csv"
echo "hivesql"
sql_s="
select
d_code,
id,
start_time,
from_unixtime(cast(start_time as int), 'yyyy/MM/dd HH:mm:ss') s_time,
end_time,
from_unixtime(cast(end_time as int), 'yyyy/MM/dd HH:mm:ss') e_time,
inc_day
from
dm.data p_test
where
inc_day between '20210101' and '20210131'
and d_code like '5%'
order by
start_time
"
echo "hive建表推数据,适用于数据量超大时,hive按行读取慢"
echo "执行sql语句取数"
hive --incremental=true --outputformat=tsv2 -e "$sql_s" >$s_path
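# Added sketch: abort if the extract came back empty instead of feeding an
# empty file to the steps below.
[ -s "$s_path" ] || { echo "empty extract: $s_path"; exit 1; }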
echo "getmerge的形式取出数据"
hdfs dfs -getmerge hdfs://**bd/user/hive/warehouse/d.db/exp_new/c_code='0' $t_path
echo "插入第一行表头"
sed -i 1i\un\\id\\tspeed $t_path
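# Optional check (added): print the first line to confirm the header landed.
head -n 1 "$t_path"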
echo "开始跑数"
echo "设置环境变量"
export CONDA_HOME=${cur_dir}/env
export PATH=$CONDA_HOME/bin:$PATH
echo "开始运行Python"
chmod 755 env/bin/python3.7
env/bin/python3.7 s.py
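# Added sketch (not in the original): capture the interpreter's exit status and
# stop before the FTP push if the job failed.
rc=$?
if [ "$rc" -ne 0 ]; then
    echo "Python job failed with exit code $rc"
    exit "$rc"
fi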
echo "结束运行Python"
echo "结果推送至服务器"
HOST='10.118.48.128'
USER='test'
PASSWD='******'
cd env/data/output
echo "要推送的文件"
FILE='res.csv'
echo "服务器目标文件路径,创建以日期命名的文件夹"
DEST=/000/bg_data/`date +%m%d`
ftp -inv $HOST <<END_SCRIPT
quote USER $USER
quote PASS $PASSWD
binary
mkdir $DEST
cd $DEST
mdelete $FILE
mput $FILE
quit
END_SCRIPT
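# Note (added): if the dated directory already exists, mkdir just reports an
# error and the ftp session continues, so same-day re-runs still work; mdelete
# clears any previous copy before mput uploads the new one.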
exit 0
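Hive side of the "build a table and push the data" step mentioned above: the dynamic-partition settings, followed by the insert that populates the partitioned table.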
-- hive.exec.dynamic.partition.mode defaults to strict, which requires at least
-- one static partition value per insert; nonstrict lets every partition column
-- be dynamic
set hive.exec.dynamic.partition.mode=nonstrict;
-- Sort rows by the dynamic partition key before writing, so each reducer keeps
-- only one partition writer open at a time (cuts memory and small-file pressure)
set hive.optimize.sort.dynamic.partition=true;
-- Upper bound on the total number of dynamic partitions a single statement may
-- create; exceeding it raises an error
set hive.exec.max.dynamic.partitions=1000;
-- Upper bound per mapper or reducer; a task that tries to create more dynamic
-- partitions than this fails
set hive.exec.max.dynamic.partitions.pernode=1000;
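-- Note (added): dynamic partitioning also needs the master switch below; it is
-- on by default in recent Hive releases but off in some older ones
set hive.exec.dynamic.partition=true;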
-- Overwrite the partition with the new data (c_code='0' here is a static
-- partition value, so this particular insert does not exercise the dynamic
-- partition settings above)
insert overwrite table exp partition (c_code='0')
select
id,
speed
from
dm.new
where inc_day between '20201222' and '20210224' and c_code='0';
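-- Quick check (added sketch): confirm the partition was written and sample it
show partitions exp;
select id, speed from exp where c_code='0' limit 10;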