学习笔记--Impala
参考 https://www.cloudera.com/documentation/enterprise/5-5-x/topics/impala_datetime_functions.html
hdfs文件操作
$ whoami
cloudera
$ hdfs dfs -ls /user
Found 3 items
drwxr-xr-x - cloudera cloudera 0 2013-04-22 18:54 /user/cloudera
drwxrwx--- - mapred mapred 0 2013-03-15 20:11 /user/history
drwxr-xr-x - hue supergroup 0 2013-03-15 20:10 /user/hive
$ hdfs dfs -mkdir -p /user/cloudera/sample_data/tab1 /user/cloudera/sample_data/tab2
$ hdfs dfs -put tab1.csv /user/cloudera/sample_data/tab1
$ hdfs dfs -cat /user/hive/tpcds/customer/customer.dat
Impala和关系数据库
Impala | 关系型数据库 |
---|---|
Impala使用类似于HiveQL的类似SQL的查询语言。 | 关系数据库使用SQL语言。 |
在Impala中,您无法更新或删除单个记录。 | 在关系数据库中,可以更新或删除单个记录。 |
Impala不支持事务。 | 关系数据库支持事务。 |
Impala不支持索引。 | 关系数据库支持索引。 |
Impala存储和管理大量数据(PB)。 | 与Impala相比,关系数据库处理的数据量较少(TB)。 |
HBase、Hive和Impala
HBase | Hive | Impala |
---|---|---|
HBase是基于Apache Hadoop的宽列存储数据库。 它使用BigTable的概念。 | Hive是一个数据仓库软件。 使用它,我们可以访问和管理基于Hadoop的大型分布式数据集。 | Impala是一个管理,分析存储在Hadoop上的数据的工具。 |
HBase的数据模型是宽列存储。 | Hive遵循关系模型。 | Impala遵循关系模型。 |
HBase是使用Java语言开发的。 | Hive是使用Java语言开发的。 | Impala是使用C ++开发的。 |
HBase的数据模型是无模式的。 | Hive的数据模型是基于模式的。 | Impala的数据模型是基于模式的。 |
HBase提供Java,RESTful和Thrift API。 | Hive提供JDBC,ODBC,Thrift API。 | Impala提供JDBC和ODBC API。 |
支持C,C#,C ++,Groovy,Java PHP,Python和Scala等编程语言。 | 支持C ++,Java,PHP和Python等编程语言。 | Impala支持所有支持JDBC / ODBC的语言。 |
HBase提供对触发器的支持。 | Hive不提供任何触发器支持。 | Impala不提供对触发器的任何支持。 |
打开Impala-shell
[root@quickstart cloudera] # impala-shell
SQL
利用外部数据文件创建Impala的表
CREATE EXTERNAL TABLE tab1
(
id INT,
col_1 BOOLEAN,
col_2 DOUBLE,
col_3 TIMESTAMP
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/user/cloudera/sample_data/tab1';
创建分区
create database external_partitions;
use external_partitions;
create table logs (field1 string, field2 string, field3 string)
partitioned by (year string, month string , day string, host string)
row format delimited fields terminated by ',';
insert into logs partition (year="2013", month="07", day="28", host="host1") values ("foo","foo","foo");
insert into logs partition (year="2013", month="07", day="28", host="host2") values ("foo","foo","foo");
ALTER TABLE
ALTER TABLE [old_db_name.]old_table_name RENAME TO [new_db_name.]new_table_name
ALTER TABLE name ADD COLUMNS (col_spec[, col_spec ...])
ALTER TABLE name DROP [COLUMN] column_name
ALTER TABLE name CHANGE column_name new_name new_type
ALTER TABLE name REPLACE COLUMNS (col_spec[, col_spec ...])
ALTER TABLE name { ADD [IF NOT EXISTS] | DROP [IF EXISTS] } PARTITION (partition_spec) [PURGE]
ALTER TABLE name RECOVER PARTITIONS
ALTER TABLE name [PARTITION (partition_spec)]
SET { FILEFORMAT file_format
| LOCATION 'hdfs_path_of_directory'
| TBLPROPERTIES (table_properties)
| SERDEPROPERTIES (serde_properties) }
ALTER TABLE name [PARTITION (partition_spec)] SET { CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED }
new_name ::= [new_database.]new_table_name
col_spec ::= col_name type_name
partition_spec ::= partition_col=constant_value
table_properties ::= 'name'='value'[, 'name'='value' ...]
serde_properties ::= 'name'='value'[, 'name'='value' ...]
file_format ::= { PARQUET | TEXTFILE | RCFILE | SEQUENCEFILE | AVRO }
ALTER VIEW
ALTER VIEW [database_name.]view_name AS select_statement
ALTER VIEW [database_name.]view_name RENAME TO [database_name.]view_name
COMPUTE STATS
COMPUTE STATS [db_name.]table_name
COMPUTE INCREMENTAL STATS [db_name.]table_name [PARTITION (partition_spec)]
partition_spec ::= partition_col=constant_value
CREATE DATABASE
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name[COMMENT 'database_comment']
[LOCATION hdfs_path];
CREATE FUNCTION
--Creates a user-defined function (UDF)
CREATE FUNCTION [IF NOT EXISTS] [db_name.]function_name([arg_type[, arg_type...])
RETURNS return_type
LOCATION 'hdfs_path'
SYMBOL='symbol_or_class'
--user-defined aggregate function (UDA)
CREATE [AGGREGATE] FUNCTION [IF NOT EXISTS] [db_name.]function_name([arg_type[, arg_type...])
RETURNS return_type
LOCATION 'hdfs_path'
[INIT_FN='function]
UPDATE_FN='function
MERGE_FN='function
[PREPARE_FN='function]
[CLOSEFN='function]
[SERIALIZE_FN='function]
[FINALIZE_FN='function]
[INTERMEDIATE type_spec]
CREATE ROLE
CREATE ROLE role_name
CREATE TABLE
--Explicit column definitions:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
(col_name data_type [COMMENT 'col_comment'], ...)
[PARTITIONED BY (col_name data_type [COMMENT 'col_comment'], ...)]
[COMMENT 'table_comment']
[WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
[
[ROW FORMAT row_format] [STORED AS file_format]
]
[LOCATION 'hdfs_path']
[TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
[CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
--Column definitions inferred from data file:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
LIKE PARQUET 'hdfs_path_of_parquet_file'
[COMMENT 'table_comment']
[PARTITIONED BY (col_name data_type [COMMENT 'col_comment'], ...)]
[WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
[
[ROW FORMAT row_format] [STORED AS file_format]
]
[LOCATION 'hdfs_path']
[TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
[CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
data_type:
primitive_type
| array_type
| map_type
| struct_type
--CREATE TABLE AS SELECT:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] db_name.]table_name
[COMMENT 'table_comment']
[WITH SERDEPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
[
[ROW FORMAT row_format] [STORED AS ctas_file_format]
]
[LOCATION 'hdfs_path']
[TBLPROPERTIES ('key1'='value1', 'key2'='value2', ...)]
[CACHED IN 'pool_name' [WITH REPLICATION = integer] | UNCACHED]
AS
select_statement
primitive_type:
TINYINT
| SMALLINT
| INT
| BIGINT
| BOOLEAN
| FLOAT
| DOUBLE
| DECIMAL
| STRING
| CHAR
| VARCHAR
| TIMESTAMP
complex_type:
struct_type
| array_type
| map_type
struct_type: STRUCT < name : primitive_or_complex_type [COMMENT 'comment_string'], ... >
array_type: ARRAY < primitive_or_complex_type >
map_type: MAP < primitive_type, primitive_or_complex_type >
row_format:
DELIMITED [FIELDS TERMINATED BY 'char' [ESCAPED BY 'char']]
[LINES TERMINATED BY 'char']
file_format:
PARQUET
| TEXTFILE
| AVRO
| SEQUENCEFILE
| RCFILE
ctas_file_format:
PARQUET
| TEXTFILE
CREATE VIEW
CREATE VIEW [IF NOT EXISTS] view_name [(column_list)]
AS select_statement
DESCRIBE
DESCRIBE [FORMATTED] [db_name.]table_name[.complex_col_name ...]
DROP DATABASE
DROP (DATABASE|SCHEMA) [IF EXISTS] database_name [RESTRICT | CASCADE];
DROP TABLE
DROP TABLE [IF EXISTS] [db_name.]table_name [PURGE]
DROP VIEW
DROP VIEW [IF EXISTS] [db_name.]view_name
DROP FUNCTION
DROP [AGGREGATE] FUNCTION [IF EXISTS] [db_name.]function_name(type[, type...])
DROP ROLE
DROP ROLE role_name
DROP STATS
DROP STATS [database_name.]table_name
DROP INCREMENTAL STATS [database_name.]table_name PARTITION (partition_spec)
partition_spec ::= partition_col=constant_value
EXPLAIN
EXPLAIN { select_query | ctas_stmt | insert_stmt }
GRANT
GRANT ROLE role_name TO GROUP group_name
GRANT privilege ON object_type object_name
TO [ROLE] roleName
[WITH GRANT OPTION]
privilege ::= SELECT | SELECT(column_name) | INSERT | ALL
object_type ::= TABLE | DATABASE | SERVER | URI
INSERT
[with_clause]
INSERT { INTO | OVERWRITE } [TABLE] table_name
[(column_list)]
[ PARTITION (partition_clause)]
{
[hint_clause] select_statement
| VALUES (value [, value ...]) [, (value [, value ...]) ...]
}
partition_clause ::= col_name [= constant] [, col_name [= constant] ...]
hint_clause ::= [SHUFFLE] | [NOSHUFFLE] (Note: the square brackets are part of the syntax.)
INVALIDATE METADATA
INVALIDATE METADATA [[db_name.]table_name]
LOAD DATA
LOAD DATA INPATH 'hdfs_file_or_directory_path' [OVERWRITE] INTO TABLE tablename
[PARTITION (partcol1=val1, partcol2=val2 ...)]
REFRESH
REFRESH [db_name.]table_name
REVOKE
REVOKE ROLE role_name FROM GROUP group_name
REVOKE privilege ON object_type object_name
FROM [ROLE] role_name
privilege ::= SELECT | SELECT(column_name) | INSERT | ALL
object_type ::= TABLE | DATABASE | SERVER | URI
SELECT
[WITH name AS (select_expression) [, ...] ]
SELECT
[ALL | DISTINCT]
[STRAIGHT_JOIN]
expression [, expression ...]
FROM table_reference [, table_reference ...]
[[FULL | [LEFT | RIGHT] INNER | [LEFT | RIGHT] OUTER | [LEFT | RIGHT] SEMI | [LEFT | RIGHT] ANTI | CROSS]
JOIN table_reference
[ON join_equality_clauses | USING (col1[, col2 ...]] ...
WHERE conditions
GROUP BY { column | expression [ASC | DESC] [NULLS FIRST | NULLS LAST] [, ...] }
HAVING conditions
GROUP BY { column | expression [ASC | DESC] [, ...] }
LIMIT expression [OFFSET expression]
[UNION [ALL] select_statement] ...]
SET
SET [query_option=option_value]
SHOW
SHOW DATABASES [[LIKE] 'pattern']
SHOW SCHEMAS [[LIKE] 'pattern'] - an alias for SHOW DATABASES
SHOW TABLES [IN database_name] [[LIKE] 'pattern']
SHOW [AGGREGATE | ANALYTIC] FUNCTIONS [IN database_name] [[LIKE] 'pattern']
SHOW CREATE TABLE [database_name].table_name
SHOW TABLE STATS [database_name.]table_name
SHOW COLUMN STATS [database_name.]table_name
SHOW PARTITIONS [database_name.]table_name
SHOW FILES IN [database_name.]table_name [PARTITION (key_col=value [, key_col=value]]
SHOW ROLES
SHOW CURRENT ROLES
SHOW ROLE GRANT GROUP group_name
SHOW GRANT ROLE role_name
TRUNCATE TABLE
TRUNCATE TABLE [db_name.]table_name
USE
USE db_name
常用函数
数学函数
abs(numeric_type a)
acos(double a)
asin(double a)
atan(double a)
atan2(double a, double b)
bin(bigint a)
ceil(double a), ceil(decimal(p,s) a), ceiling(double a), ceiling(decimal(p,s) a), dceil(double a), dceil(decimal(p,s) a)
conv(bigint num, int from_base, int to_base), conv(string num, int from_base, int to_base)
cos(double a)
cosh(double a)
cot(double a)
degrees(double a)
e()
exp(double a), dexp(double a)
factorial(integer_type a)
位运算函数
bitand(integer_type a, same_type b)
bitnot(integer_type a)
bitor(integer_type a, same_type b)
bitxor(integer_type a, same_type b)
类型转换函数
cast(expr AS type)
typeof(type value)
时间日期函数
add_months(timestamp date, int months), add_months(timestamp date, bigint months)
adddate(timestamp startdate, int days), adddate(timestamp startdate, bigint days),
current_timestamp()
now()
date_add(timestamp startdate, int days), date_add(timestamp startdate, interval_expression)
date_part(string, timestamp)
date_sub(timestamp startdate, int days), date_sub(timestamp startdate, interval_expression)
datediff(timestamp enddate, timestamp startdate)
day(timestamp date), dayofmonth(timestamp date)
dayname(timestamp date)
dayofweek(timestamp date)
dayofyear(timestamp date)
days_add(timestamp startdate, int days), days_add(timestamp startdate, bigint days)
days_sub(timestamp startdate, int days), days_sub(timestamp startdate, bigint days)
extract(timestamp, string unit)extract(unit FROM timestamp)
条件函数
CASE a WHEN b THEN c [WHEN d THEN e]... [ELSE f] END
if(boolean condition, type ifTrue, type ifFalseOrNull)
ifnull(type a, type ifNull)
isfalse(boolean)
isnotfalse(boolean)
isnull(type a, type ifNull)
istrue(boolean)
字符串函数
ascii(string str)
btrim(string a), btrim(string a, string chars_to_trim)
char_length(string a), character_length(string a)
chr(int character_code)
concat(string a, string b...)
find_in_set(string str, string strList)
length(string a)
lower(string a), lcase(string a)
regexp_extract(string subject, string pattern, int index)
regexp_like(string source, string pattern[, string options])
repeat(string str, int n)
reverse(string a)
substr(string a, int start [, int len]), substring(string a, int start [, int len])
translate(string input, string from, string to)
trim(string a)
upper(string a), ucase(string a)
聚合函数
APPX_MEDIAN([DISTINCT | ALL] expression)
AVG([DISTINCT | ALL] expression) [OVER (analytic_clause)]
COUNT([DISTINCT | ALL] expression) [OVER (analytic_clause)]
GROUP_CONCAT([ALL] expression [, separator])
MAX([DISTINCT | ALL] expression) [OVER (analytic_clause)]
MIN([DISTINCT | ALL] expression) [OVER (analytic_clause)]
NDV([DISTINCT | ALL] expression)
{ STDDEV | STDDEV_SAMP | STDDEV_POP } ([DISTINCT | ALL] expression)
SUM([DISTINCT | ALL] expression) [OVER (analytic_clause)]
{ VARIANCE | VAR[IANCE]_SAMP | VAR[IANCE]_POP } ([DISTINCT | ALL] expression)