[源码分享] HIVE表数据量统计&邮件
概要:
计算HIVE BI库下每天数据表总大小及增量
输出:
总大小:xxxG 日同比新增数据量:xxxG 周同比新增数据量:xxxG 月同比新增数据量:xxxG 总表数:xxx 日新增表数:xxx 周新增表数:xxx 月新增表数:xxx 最大的20张表: ...... 表数据增量TOP20: ......
代码:
bi_report.sh
#!/bin/bash GIGA=1000000000 content="\n" prefix='bi' today=`date -d"-1 day" +%Y-%m-%d` yestoday=`date -d"-2 day" +%Y-%m-%d` lastweek=`date -d"-8 day" +%Y-%m-%d` lastmonth=`date -d"-1 month -1 day" +%Y-%m-%d` hadoop fs -du /user/hive/warehouse/bi.db/ > ./bi.db.stat awk '{if(NR!=1) {size=split($2,table_list,"/"); print table_list[size],"\t",$1}}' ./bi.db.stat > $prefix.$today sum=`awk '{sum+=$2} END{print sum/"'$GIGA'"}' $prefix.$today` content+="总大小:"$sum"G\n" yes_sum=`awk '{sum+=$2} END{print sum/"'$GIGA'"}' $prefix.$yestoday` diff_size=`echo "$sum-$yes_sum" | bc` content+="日同比新增数据量:"$diff_size"G\n" week_sum=`awk '{sum+=$2} END{print sum/"'$GIGA'"}' $prefix.$lastweek` diff_size=`echo "$sum-$week_sum" | bc` content+="周同比新增数据量:"$diff_size"G\n" month_sum=`awk '{sum+=$2} END{print sum/"'$GIGA'"}' $prefix.$lastmonth` diff_size=`echo "$sum-$month_sum" | bc` content+="月同比新增数据量:"$diff_size"G\n" table_count=`wc -l $prefix.$today | awk '{print $1}'` content+="\n\n总表数:"$table_count"\n" yes_table_count=`wc -l $prefix.$yestoday | awk '{print $1}'` diff_table_count=`echo "$table_count-$yes_table_count" | bc` content+="日新增表数:"$diff_table_count"\n" lastweek_table_count=`wc -l $prefix.$lastweek | awk '{print $1}'` diff_table_count=`echo "$table_count-$lastweek_table_count" | bc` content+="周新增表数:"$diff_table_count"\n" lastmonth_table_count=`wc -l $prefix.$lastmonth | awk '{print $1}'` diff_table_count=`echo "$table_count-$lastmonth_table_count" | bc` content+="月新增表数:"$diff_table_count"\n" max_table="`sort -r -n -k 2 $prefix.$today | head -20 | awk '{ORS="";print $1":" $2/"'$GIGA'""G\\\\n"}'`" content+="\n\n最大的20张表:\n"$max_table"\n" declare -A ARR1 while read line do key=`echo $line|awk '{print $1}'` value=`echo $line|awk '{print $2}'` ARR1["$key"]=$value done < $prefix.$today declare -A ARR2 while read line do key=`echo $line|awk '{print $1}'` value=`echo $line|awk '{print $2}'` ARR2["$key"]=$value done < $prefix.$yestoday for k in "${!ARR1[@]}" do d_val=`echo "${ARR1[${k}]}-${ARR2[${k}]}"|bc` ARR1["$k"]=$d_val done cat /dev/null > bi_tmp.txt for k in "${!ARR1[@]}" do echo "$k ${ARR1[${k}]}" >> bi_tmp.txt done max_incre_table="`sort -r -n -k 2 bi_tmp.txt | head -20 | awk '{ORS="";print $1":" $2/"'$GIGA'""G\\\\n"}'`" content+="\n\n表数据增量TOP20:\n"$max_incre_table"\n" #echo -e $content msg=`echo $content` python mail.py "HIVE BI每日统计" $msg
mail.py
#! /usr/bin/python import sys import optparse import os import smtplib from email.mime.text import MIMEText ## mail server config mail_host="mail.51ping.com" mail_user="info" mail_pass="chinese1" mail_postfix="51ping.com" ###################### def send_mail(to_list,sub,content): me=mail_user+"<"+mail_user+"@"+mail_postfix+">" msg = MIMEText(content) msg['Subject'] = sub msg['From'] = me msg['To'] = ";".join(to_list) try: s = smtplib.SMTP() s.connect(mail_host) s.login(mail_user,mail_pass) s.sendmail(me, to_list, msg.as_string()) s.close() return True except Exception, e: print str(e) return False mail_to_list=["zxmever@gmail.com"] msg=sys.argv[2] msg=msg.replace("\\n","\r\n") print msg send_mail(mail_to_list,sys.argv[1],msg)
执行:
sh bi_report.sh