Hive Study Notes


Environment Setup

  • I use MySQL to store the Hive metastore metadata, so MySQL must be installed beforehand (a minimal grant sketch follows below)

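  • A minimal MySQL-side setup sketch (assuming MySQL 5.x and the root/root credentials configured in hive-site.xml below; tighten the grant for anything beyond a lab setup):

    -- run in the MySQL shell on node03; lets Hive connect from any host
    -- root/root must match javax.jdo.option.ConnectionUserName/Password
    GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY 'root';
    FLUSH PRIVILEGES;
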
  • hive-env.sh

    # Set HADOOP_HOME to point to a specific hadoop install directory
    HADOOP_HOME=/export/servers/hadoop-2.7.5
    
    
    # Hive Configuration Directory can be controlled by:
    export HIVE_CONF_DIR=/export/servers/apache-hive-2.1.1-bin/conf
    
  • hive-site.xml

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
    <configuration>
    <property>
          <name>javax.jdo.option.ConnectionUserName</name>
          <value>root</value>
      </property>
      <property>
          <name>javax.jdo.option.ConnectionPassword</name>
          <value>root</value>
      </property>
      <property>
          <name>javax.jdo.option.ConnectionURL</name>
          <value>jdbc:mysql://node03:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false</value>
      </property>
      <property>
          <name>javax.jdo.option.ConnectionDriverName</name>
          <value>com.mysql.jdbc.Driver</value>
      </property>
      <property>
          <name>hive.metastore.schema.verification</name>
          <value>false</value>
      </property>
      <property>
        <name>datanucleus.schema.autoCreateAll</name>
        <value>true</value>
     </property>
     <property>
    		<name>hive.server2.thrift.bind.host</name>
    		<value>node03</value>
       </property>
    </configuration>
    
    
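  • With the two files above in place, put the MySQL JDBC driver on Hive's classpath and initialize the metastore schema. A sketch (the driver jar version is an example):

    # copy the MySQL JDBC driver into Hive's lib directory
    cp mysql-connector-java-5.1.38.jar /export/servers/apache-hive-2.1.1-bin/lib/
    # create the metastore tables in MySQL (Hive 2.x schematool)
    cd /export/servers/apache-hive-2.1.1-bin
    bin/schematool -dbType mysql -initSchema
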

Common Hive Commands

  • Entering the Hive shell

    bin/hive
    # run a Hive command without entering the interactive shell
    bin/hive -e 'show databases'
    
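  • Because hive-site.xml binds HiveServer2 to node03, Beeline can also connect over JDBC; a sketch (assuming the default Thrift port 10000; the script path is illustrative):

    # start HiveServer2 in the background
    nohup bin/hive --service hiveserver2 &
    # connect with Beeline over JDBC
    bin/beeline -u jdbc:hive2://node03:10000 -n root
    # run a whole script file without entering the shell
    bin/hive -f /hivedatas/init.sql
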
  • Creating a (managed or external) table with a field delimiter

    create [external] table student(
        sid string,
        name string
    ) [partitioned by (year string, month string, day string)]
    row format delimited fields terminated by '\t';
    
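  • A concrete instance of the template above (table name and HDFS location are illustrative): an external, partitioned table whose partitions are registered explicitly:

    create external table student_ext(
        sid string,
        name string
    ) partitioned by (year string, month string, day string)
    row format delimited fields terminated by '\t'
    location '/hivedatas/student_ext';
    -- dropping an external table keeps the HDFS data; partitions are added by hand
    alter table student_ext add partition (year='2021', month='02', day='02');
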
  • Loading data

    load data [local] inpath '/hivedatas/student.csv' [overwrite] into table student [partition (key='')];
    
    # Loading data into a bucketed table: load data only moves files and does not
    # bucket the rows, so route the data through a staging table.
    # create the bucketed table
    create table course (c_id string, c_name string, t_id string) clustered by(c_id) into 3 buckets row format delimited fields terminated by '\t';
    # create a staging table (column names must match)
    create table course_common (c_id string, c_name string, t_id string) row format delimited fields terminated by '\t';
    # load the file into the staging table
    load data local inpath '/export/servers/hivedatas/course.csv' into table course_common;
    # insert from the staging table into the bucketed table
    insert overwrite table course select * from course_common cluster by(c_id);
    
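  • To confirm that the three buckets were written, list the table directory or read back a single bucket (the warehouse path assumes the default hive.metastore.warehouse.dir):

    # each bucket is materialized as one file under the table directory
    hadoop fs -ls /user/hive/warehouse/course
    # sample only the first of the 3 buckets
    bin/hive -e 'select * from course tablesample(bucket 1 out of 3 on c_id)'
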
  • Query statements

    # The syntax largely matches SQL, with the added concepts of partitions and buckets; queries compile down to one or more MapReduce jobs
    select [distinct] sid from student 
    [where sid='123']
    [group by sid [having name like '李%']]
    [cluster by sid
    | [distribute by sid] [sort by | order by sid]
    ]
    [limit 1,10];
    # group by aggregates rows by the given key(s), which is efficient, but when some keys carry far more rows than others the data skews and the whole job slows to the pace of the largest group
    # distribute by corresponds to the MapReduce partition step; sort by does a local sort within each reducer, while order by is a global sort. When distribute by and sort by share the same column, cluster by is a shorthand for both, but it sorts in ascending order only
    
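  • A concrete illustration of the comments above (run against the student table; with 3 reducers, distribute by spreads the rows across 3 output sets):

    -- local sort inside each reducer chosen by sid
    select * from student distribute by sid sort by sid;
    -- shorthand when both columns match; ascending order only
    select * from student cluster by sid;
    -- global sort: one totally ordered result, but a single reducer does the work
    select * from student order by sid desc;
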
  • Inspecting database and table info

    desc database db_name;
    desc [formatted] student;
    
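  • show create table prints the complete DDL, which is handy when copying a table definition (show partitions applies only to partitioned tables):

    show databases;
    show tables;
    show create table student;
    show partitions student;
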
  • Writing query results to a local file

    insert overwrite local directory '/data/' select * from student;
    
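  • The export above uses Hive's default '\001' delimiter; since Hive 0.11 a delimiter can be specified explicitly (the output directory is illustrative and gets overwritten):

    insert overwrite local directory '/data/student_out'
    row format delimited fields terminated by ','
    select * from student;
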
  • Setting the number of reducers

    set mapreduce.job.reduces=3;
    
  • Enabling bucket enforcement (needed on Hive 1.x; in Hive 2.0+ the property was removed and bucketing is always enforced)

    set hive.enforce.bucketing=true;
    
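  • set only affects the current session; run it without a value to inspect a property, and put permanent settings in hive-site.xml:

    set mapreduce.job.reduces;
    set hive.enforce.bucketing;
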
posted @ 2021-02-02 23:08  公鸡不下蛋