相关显示参数设置
set hive.cli.print.header=true; // 打印列名
set hive.cli.print.row.to.vertical=true; // 开启行转列功能, 前提必须开启打印列名功能
set hive.cli.print.row.to.vertical.num=1; // 设置每行显示的列数
set hive.exec.dynamic.partition=true;使用动态分区
set hive.exec.dynamic.partition.mode=nonstrict;
SET hive.exec.max.dynamic.partitions=100000;设置允许最大动态分区的数量
SET hive.exec.max.dynamic.partitions.pernode=100000;
创建数据库
CREATE DATABASE [IF NOT EXISTS] userdb;
建表
create table user_summary_with_tel (
bill_time string comment '订单时间',
tel string comment '订购号码',
bill_addr string comment '配送地址',
county string comment '区县',
street string comment '街道',
community string comment '社区',
campus string comment '校园',
company string comment '公司',
building string comment '大厦',
residential_district string comment '住宅小区',
industrial_district string comment '工业区',
city_village string comment '城中村',
address string comment '地址名称',
attribute string comment '属性',
return_address string comment '返回地址',
ordermonth string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
create table tb_class_info
(id int,
createtime timestamp ,
modifytime timestamp)
PARTITIONED BY (day string, class_name string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
CREATE TABLE use_resident(
tel string,
bill_addr string,
address string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
本地数据写入hive表
load data local inpath '/data/use_resident.csv' overwrite into table use_resident;
load data local inpath '/home/hadoop/tbclass.txt' overwrite into table tb_class_info partition(day='2017-06-01', class_name=val2);
查询结果存储
insert overwrite table tel_imsi_9551 select distinct(tel, imsi) from sgs_data_partition where tac='9551';
INSERT INTO TABLE sgs_data_part
PARTITION (day, tac)
SELECT tel, imsi, time, tac_cid day, tac
FROM sgs_data_src;
hive -e "user sgs;select a.imsi, b.tel_num, a.summary from user_summary a left outer join imsi_tel_9551 b on a.imsi=b.imsi;" >> user_summary.txt
hive调用python的udf
udf文件
# encoding: utf-8
'''
@author: zcc
@license: (C) Copyright 2013-2017, Node Supply Chain Manager Corporation Limited.
@contact: ********@163.com
@software: pycharm
@file: hive_udf.py
@time: 8/25/17 9:00 AM
@desc:自定义hive函数处理合并后的用户信息
'''
import sys
from collections import Counter
for line in sys.stdin:
imsi, tac_cids = line.strip().split('\t')
tac_cids = tac_cids.split('|')
# 单个用户16天内累计SGSIMSIAatach和SGSIMSIDetach次数小于10则剔除该用户
if len(tac_cids) < 8: continue
tac_cid_count = Counter(tac_cids)
tac_cid_elements = list()
for key, value in tac_cid_count.iteritems():
if value <= 1: continue
tac_cid_elements.append("{0}:{1}".format(key, value))
print '%s\t%s' % (imsi, ','.join(tac_cid_elements))
add file /home/hadoop/hive_udf.py;
insert overwrite table user_summary select transform(*) using 'python hive_udf.py' as (imsi, tac_cids) from user_info_merge_9551;
清空表
truncate table tel_imsi_9551;
删除表
DROP TABLE [IF EXISTS] table_name;
删除数据库
DROP DATABASE IF EXISTS userdb;
更改表的字段属性
ALTER TABLE table_one CHANGE old_name new_name net_type_name;
删除表的一个指定分区
alter table table_name drop partition (day=partition_name);
查询结果写入一张新表
CREATE TABLE table_name STORED AS SEQUENCEFILE SELECT * FROM src_table_name;