学习笔记--Impala

参考 https://www.cloudera.com/documentation/enterprise/5-5-x/topics/impala_datetime_functions.html

hdfs文件操作

$ whoami
cloudera
$ hdfs dfs -ls /user
Found 3 items
drwxr-xr-x  - cloudera cloudera            0 2013-04-22 18:54 /user/cloudera
drwxrwx---  - mapred  mapred              0 2013-03-15 20:11 /user/history
drwxr-xr-x  - hue      supergroup          0 2013-03-15 20:10 /user/hive

$ hdfs dfs -mkdir -p /user/cloudera/sample_data/tab1 /user/cloudera/sample_data/tab2
$ hdfs dfs -put tab1.csv /user/cloudera/sample_data/tab1
$ hdfs dfs -cat /user/hive/tpcds/customer/customer.dat

Impala和关系数据库

Impala 关系型数据库
Impala使用类似于HiveQL的类似SQL的查询语言。 关系数据库使用SQL语言。
在Impala中,您无法更新或删除单个记录。 在关系数据库中,可以更新或删除单个记录。
Impala不支持事务。 关系数据库支持事务。
Impala不支持索引。 关系数据库支持索引。
Impala存储和管理大量数据(PB)。 与Impala相比,关系数据库处理的数据量较少(TB)。

HBase、Hive和Impala

HBase Hive Impala
HBase是基于Apache Hadoop的宽列存储数据库。 它使用BigTable的概念。 Hive是一个数据仓库软件。 使用它,我们可以访问和管理基于Hadoop的大型分布式数据集。 Impala是一个管理,分析存储在Hadoop上的数据的工具。
HBase的数据模型是宽列存储。 Hive遵循关系模型。 Impala遵循关系模型。
HBase是使用Java语言开发的。 Hive是使用Java语言开发的。 Impala是使用C ++开发的。
HBase的数据模型是无模式的。 Hive的数据模型是基于模式的。 Impala的数据模型是基于模式的。
HBase提供Java,RESTful和Thrift API。 Hive提供JDBC,ODBC,Thrift API。 Impala提供JDBC和ODBC API。
支持C,C#,C ++,Groovy,Java PHP,Python和Scala等编程语言。 支持C ++,Java,PHP和Python等编程语言。 Impala支持所有支持JDBC / ODBC的语言。
HBase提供对触发器的支持。 Hive不提供任何触发器支持。 Impala不提供对触发器的任何支持。

打开Impala-shell

[root@quickstart cloudera] # impala-shell 

SQL

利用外部数据文件创建Impala的表

CREATE EXTERNAL TABLE tab1
(
  id INT,
  col_1 BOOLEAN,
  col_2 DOUBLE,
  col_3 TIMESTAMP
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/cloudera/sample_data/tab1';

创建分区

create database external_partitions;
use external_partitions;
create table logs (field1 string, field2 string, field3 string)
  partitioned by (year string, month string , day string, host string)
  row format delimited fields terminated by ',';
insert into logs partition (year="2013", month="07", day="28", host="host1") values ("foo","foo","foo");
insert into logs partition (year="2013", month="07", day="28", host="host2") values ("foo","foo","foo");

ALTER TABLE

ALTER TABLE [old_db_name.]old_table_name RENAME TO [new_db_name.]new_table_name

ALTER TABLE name ADD COLUMNS (col_spec[, col_spec ...])
ALTER TABLE name DROP [COLUMN] column_name
ALTER TABLE name CHANGE column_name new_name new_type
ALTER TABLE name REPLACE COLUMNS (col_spec[, col_spec ...])

ALTER TABLE name { ADD [IF NOT EXISTS] | DROP [IF EXISTS] } PARTITION (partition_spec) [PURGE]
ALTER TABLE name RECOVER PARTITIONS

ALTER TABLE name [PARTITION (partition_spec)]
  SET { FILEFORMAT file_format
  | LOCATION 'hdfs_path_of_directory'
  | TBLPROPERTIES (table_properties)
  | SERDEPROPERTIES (serde_properties) }

ALTER TABLE name [PARTITION (partition_spec)] SET { CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED }

new_name ::= [new_database.]new_table_name

col_spec ::= col_name type_name

partition_spec ::= partition_col=constant_value

table_properties ::= 'name'='value'[, 'name'='value' ...]

serde_properties ::= 'name'='value'[, 'name'='value' ...]

file_format ::= { PARQUET | TEXTFILE | RCFILE | SEQUENCEFILE | AVRO }

ALTER VIEW

ALTER VIEW [database_name.]view_name AS select_statement
ALTER VIEW [database_name.]view_name RENAME TO [database_name.]view_name

COMPUTE STATS

COMPUTE STATS [db_name.]table_name
COMPUTE INCREMENTAL STATS [db_name.]table_name [PARTITION (partition_spec)]

partition_spec ::= partition_col=constant_value

CREATE DATABASE

CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name[COMMENT 'database_comment']
  [LOCATION hdfs_path];

CREATE FUNCTION

--Creates a user-defined function (UDF)
CREATE FUNCTION [IF NOT EXISTS] [db_name.]function_name([arg_type[, arg_type...])
  RETURNS return_type
  LOCATION 'hdfs_path'
  SYMBOL='symbol_or_class'
--user-defined aggregate function (UDA)
CREATE [AGGREGATE] FUNCTION [IF NOT EXISTS] [db_name.]function_name([arg_type[, arg_type...])
  RETURNS return_type
  LOCATION 'hdfs_path'
  [INIT_FN='function]
  UPDATE_FN='function
  MERGE_FN='function
  [PREPARE_FN='function]
  [CLOSEFN='function]
  [SERIALIZE_FN='function]
  [FINALIZE_FN='function]
  [INTERMEDIATE type_spec]

CREATE ROLE

CREATE ROLE role_name

CREATE TABLE

--Explicit column definitions:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  (col_name data_type [COMMENT 'col_comment'], ...)
  [PARTITIONED BY (col_name data_type [COMMENT 'col_comment'], ...)]
  [COMMENT 'table_comment']
  [WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [
  [ROW FORMAT row_format] [STORED AS file_format]
  ]
  [LOCATION 'hdfs_path']
  [TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
--Column definitions inferred from data file:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
  LIKE PARQUET 'hdfs_path_of_parquet_file'
  [COMMENT 'table_comment']
  [PARTITIONED BY (col_name data_type [COMMENT 'col_comment'], ...)]
  [WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [
  [ROW FORMAT row_format] [STORED AS file_format]
  ]
  [LOCATION 'hdfs_path']
  [TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
data_type:
    primitive_type
  | array_type
  | map_type
  | struct_type
--CREATE TABLE AS SELECT:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] db_name.]table_name
  [COMMENT 'table_comment']
  [WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [
  [ROW FORMAT row_format] [STORED AS ctas_file_format]
  ]
  [LOCATION 'hdfs_path']
  [TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
  [CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
AS
  select_statement
primitive_type:
    TINYINT
  | SMALLINT
  | INT
  | BIGINT
  | BOOLEAN
  | FLOAT
  | DOUBLE
  | DECIMAL
  | STRING
  | CHAR
  | VARCHAR
  | TIMESTAMP

complex_type:
    struct_type
  | array_type
  | map_type

struct_type: STRUCT < name : primitive_or_complex_type [COMMENT 'comment_string'], ... >

array_type: ARRAY < primitive_or_complex_type >

map_type: MAP < primitive_type, primitive_or_complex_type >

row_format:
  DELIMITED [FIELDS TERMINATED BY 'char' [ESCAPED BY 'char']]
  [LINES TERMINATED BY 'char']

file_format:
    PARQUET
  | TEXTFILE
  | AVRO
  | SEQUENCEFILE
  | RCFILE

ctas_file_format:
    PARQUET
  | TEXTFILE

CREATE VIEW

CREATE VIEW [IF NOT EXISTS] view_name [(column_list)]
  AS select_statement

DESCRIBE

DESCRIBE [FORMATTED] [db_name.]table_name[.complex_col_name ...]

DROP DATABASE

DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT | CASCADE];

DROP TABLE

DROP TABLE [IF EXISTS] [db_name.]table_name [PURGE]

DROP VIEW

DROP VIEW [IF EXISTS] [db_name.]view_name

DROP FUNCTION

DROP [AGGREGATE] FUNCTION [IF EXISTS] [db_name.]function_name(type[, type...])

DROP ROLE

DROP ROLE role_name

DROP STATS

DROP STATS [database_name.]table_name
DROP INCREMENTAL STATS [database_name.]table_name PARTITION (partition_spec)

partition_spec ::= partition_col=constant_value

EXPLAIN

EXPLAIN { select_query | ctas_stmt | insert_stmt }

GRANT

GRANT ROLE role_name TO GROUP group_name

GRANT privilege ON object_type object_name
  TO [ROLE] roleName
  [WITH GRANT OPTION]

privilege ::= SELECT | SELECT(column_name) | INSERT | ALL
object_type ::= TABLE | DATABASE | SERVER | URI

INSERT

[with_clause]
INSERT { INTO | OVERWRITE } [TABLE] table_name
  [(column_list)]
  [ PARTITION (partition_clause)]
{
    [hint_clause] select_statement
  | VALUES (value [, value ...]) [, (value [, value ...]) ...]
}

partition_clause ::= col_name [= constant] [, col_name [= constant] ...]

hint_clause ::= [SHUFFLE] | [NOSHUFFLE]    (Note: the square brackets are part of the syntax.)

INVALIDATE METADATA

INVALIDATE METADATA [[db_name.]table_name]

LOAD DATA

LOAD DATA INPATH 'hdfs_file_or_directory_path' [OVERWRITE] INTO TABLE tablename
  [PARTITION (partcol1=val1, partcol2=val2 ...)]

REFRESH

REFRESH [db_name.]table_name

REVOKE

REVOKE ROLE role_name FROM GROUP group_name

REVOKE privilege ON object_type object_name
  FROM [ROLE] role_name

privilege ::= SELECT | SELECT(column_name) | INSERT | ALL
object_type ::= TABLE | DATABASE | SERVER | URI

SELECT

[WITH name AS (select_expression) [, ...] ]
SELECT
  [ALL | DISTINCT]
  [STRAIGHT_JOIN]
  expression [, expression ...]
FROM table_reference [, table_reference ...]
[[FULL | [LEFT | RIGHT] INNER | [LEFT | RIGHT] OUTER | [LEFT | RIGHT] SEMI | [LEFT | RIGHT] ANTI | CROSS]
  JOIN table_reference
  [ON join_equality_clauses | USING (col1[, col2 ...]] ...
WHERE conditions
GROUP BY { column | expression [ASC | DESC] [NULLS FIRST | NULLS LAST] [, ...] }
HAVING conditions
GROUP BY { column | expression [ASC | DESC] [, ...] }
LIMIT expression [OFFSET expression]
[UNION [ALL] select_statement] ...]

SET

SET [query_option=option_value]

SHOW

SHOW DATABASES [[LIKE] 'pattern']
SHOW SCHEMAS [[LIKE] 'pattern'] - an alias for SHOW DATABASES
SHOW TABLES [IN database_name] [[LIKE] 'pattern']
SHOW [AGGREGATE | ANALYTIC] FUNCTIONS [IN database_name] [[LIKE] 'pattern']
SHOW CREATE TABLE [database_name].table_name
SHOW TABLE STATS [database_name.]table_name
SHOW COLUMN STATS [database_name.]table_name
SHOW PARTITIONS [database_name.]table_name
SHOW FILES IN [database_name.]table_name [PARTITION (key_col=value [, key_col=value]]

SHOW ROLES
SHOW CURRENT ROLES
SHOW ROLE GRANT GROUP group_name
SHOW GRANT ROLE role_name

TRUNCATE TABLE

TRUNCATE TABLE [db_name.]table_name

USE

USE db_name

常用函数

数学函数

abs(numeric_type a)
acos(double a)
asin(double a)
atan(double a)
atan2(double a, double b)
bin(bigint a)
ceil(double a), ceil(decimal(p,s) a), ceiling(double a), ceiling(decimal(p,s) a), dceil(double a), dceil(decimal(p,s) a)
conv(bigint num, int from_base, int to_base), conv(string num, int from_base, int to_base)
cos(double a)
cosh(double a)
cot(double a)
degrees(double a)
e()
exp(double a), dexp(double a)
factorial(integer_type a)

位运算函数

bitand(integer_type a, same_type b)
bitnot(integer_type a)
bitor(integer_type a, same_type b)
bitxor(integer_type a, same_type b)

类型转换函数

cast(expr AS type)
typeof(type value)

时间日期函数

add_months(timestamp date, int months), add_months(timestamp date, bigint months)
adddate(timestamp startdate, int days), adddate(timestamp startdate, bigint days),
current_timestamp()
now()
date_add(timestamp startdate, int days), date_add(timestamp startdate, interval_expression)
date_part(string, timestamp)
date_sub(timestamp startdate, int days), date_sub(timestamp startdate, interval_expression)
datediff(timestamp enddate, timestamp startdate)
day(timestamp date), dayofmonth(timestamp date)
dayname(timestamp date)
dayofweek(timestamp date)
dayofyear(timestamp date)
days_add(timestamp startdate, int days), days_add(timestamp startdate, bigint days)
days_sub(timestamp startdate, int days), days_sub(timestamp startdate, bigint days)
extract(timestamp, string unit)extract(unit FROM timestamp)

条件函数

CASE a WHEN b THEN c [WHEN d THEN e]... [ELSE f] END
if(boolean condition, type ifTrue, type ifFalseOrNull)
ifnull(type a, type ifNull)
isfalse(boolean)
isnotfalse(boolean)
isnull(type a, type ifNull)
istrue(boolean)

字符串函数

ascii(string str)
btrim(string a), btrim(string a, string chars_to_trim)
char_length(string a), character_length(string a)
chr(int character_code)
concat(string a, string b...)
find_in_set(string str, string strList)
length(string a)
lower(string a), lcase(string a)
regexp_extract(string subject, string pattern, int index)
regexp_like(string source, string pattern[, string options])
repeat(string str, int n)
reverse(string a)
substr(string a, int start [, int len]), substring(string a, int start [, int len])
translate(string input, string from, string to)
trim(string a)
upper(string a), ucase(string a)

聚合函数

APPX_MEDIAN([DISTINCT | ALL] expression)
AVG([DISTINCT | ALL] expression) [OVER (analytic_clause)]
COUNT([DISTINCT | ALL] expression) [OVER (analytic_clause)]
GROUP_CONCAT([ALL] expression [, separator])
MAX([DISTINCT | ALL] expression) [OVER (analytic_clause)]
MIN([DISTINCT | ALL] expression) [OVER (analytic_clause)]
NDV([DISTINCT | ALL] expression)
{ STDDEV | STDDEV_SAMP | STDDEV_POP } ([DISTINCT | ALL] expression)
SUM([DISTINCT | ALL] expression) [OVER (analytic_clause)]
{ VARIANCE | VAR[IANCE]_SAMP | VAR[IANCE]_POP } ([DISTINCT | ALL] expression)
posted @ 2018-04-13 23:37  KioLuo  阅读(478)  评论(0编辑  收藏  举报