Hive学习笔记——parser

Hive是如何解析SQL的呢,首先拿hive的建表语句来举例,比如下面的建表语句

create table test(id int,name string)row format delimited fields terminated by '\t';

然后使用hive的show create table语句来查看创建的表结构,这是一张text表

CREATE TABLE `test`(
  `id` int, 
  `name` string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
WITH SERDEPROPERTIES ( 
  'field.delim'='\t', 
  'serialization.format'='\t') 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://master:8020/user/hive/warehouse/test'
TBLPROPERTIES (
  'transient_lastDdlTime'='1568561230')

当然还有其他各种建表语句,比如

csv表

CREATE EXTERNAL TABLE `default.test_1`(
	  `key` string COMMENT 'from deserializer', 
	  `value` string COMMENT 'from deserializer')
	ROW FORMAT SERDE 
	  'org.apache.hadoop.hive.serde2.OpenCSVSerde' 
	WITH SERDEPROPERTIES ( 
	  'escapeChar'='\\', 
	  'quoteChar'='\'', 
	  'separatorChar'='\t') 
	STORED AS INPUTFORMAT 
	  'org.apache.hadoop.mapred.TextInputFormat' 
	OUTPUTFORMAT 
	  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
	LOCATION
	  'hdfs://master:8020/user/hive/warehouse/test'
	TBLPROPERTIES (
	  'COLUMN_STATS_ACCURATE'='false', 
	  'numFiles'='0', 
	  'numRows'='-1', 
	  'rawDataSize'='-1', 
	  'totalSize'='0', 
	  'transient_lastDdlTime'='xxxx')

 parquet表

CREATE TABLE `default.test`(
	  `time` string, 
	  `server` int, 
	  `id` bigint)
	PARTITIONED BY ( 
	  `ds` string)
	ROW FORMAT SERDE 
	  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
	WITH SERDEPROPERTIES ( 
	  'field.delim'='\t', 
	  'serialization.format'='\t') 
	STORED AS INPUTFORMAT 
	  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
	OUTPUTFORMAT 
	  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
	LOCATION
	  'hdfs://master:8020/user/hive/warehouse/test'
	TBLPROPERTIES (
  'transient_lastDdlTime'='xxxx')

 json表

CREATE EXTERNAL TABLE `default.test`(
	  `titleid` string COMMENT 'from deserializer', 
	  `timestamp` string COMMENT 'from deserializer')
	ROW FORMAT SERDE 
	  'org.openx.data.jsonserde.JsonSerDe' 
	STORED AS INPUTFORMAT 
	  'org.apache.hadoop.mapred.TextInputFormat' 
	OUTPUTFORMAT 
	  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
	LOCATION
	  'hdfs://master:8020/user/hive/warehouse/test'
	TBLPROPERTIES (
	  'COLUMN_STATS_ACCURATE'='false', 
	  'numFiles'='0', 
	  'numRows'='-1', 
	  'rawDataSize'='-1', 
	  'totalSize'='0', 
	  'transient_lastDdlTime'='xxxx')

 es表

CREATE EXTERNAL TABLE `default.test`(
	  `id` string COMMENT 'from deserializer', 
	  `ts` string COMMENT 'from deserializer')
	PARTITIONED BY ( 
	  `ds` string)
	ROW FORMAT SERDE 
	  'org.elasticsearch.hadoop.hive.EsSerDe' 
	STORED BY 
	  'org.elasticsearch.hadoop.hive.EsStorageHandler' 
	WITH SERDEPROPERTIES ( 
	  'serialization.format'='1')
	LOCATION
	  'hdfs://master:8020/user/hive/warehouse/test'
	TBLPROPERTIES (
	  'es.index.auto.create'='yes', 
	  'es.index.read.missing.as.empty'='yes', 
	  'es.nodes'='host1,host2', 
	  'es.port'='9200', 
	  'es.resource'='index1/type1')

使用thrift的binary表

CREATE EXTERNAL TABLE `default.test`(
	  `bbb` string COMMENT 'from deserializer', 
	  `aaa` string COMMENT 'from deserializer')
	COMMENT 'aas'
	PARTITIONED BY ( 
	  `ds` string COMMENT '日期分区')
	ROW FORMAT SERDE 
	  'org.apache.hadoop.hive.serde2.thrift.ThriftDeserializer' 
	WITH SERDEPROPERTIES ( 
	  'serialization.class'='com.xxx.xxx.xxx.tables.v1.XXXX', 
	  'serialization.format'='org.apache.thrift.protocol.TCompactProtocol') 
	STORED AS INPUTFORMAT 
	  'org.apache.hadoop.mapred.SequenceFileInputFormat' 
	OUTPUTFORMAT 
	  'org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat'
	LOCATION
	  'hdfs://master:8020/user/hive/warehouse/test'
	TBLPROPERTIES (
	  'transient_lastDdlTime'='xxxxxx')

hbase表

CREATE EXTERNAL TABLE default.hbase_table(key string, k1 string,k2 string,k3 string,k4 string,ts string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.hbase.HBaseSerDe'
STORED BY
'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping"=":key,c:k1,c:k2,c:k3,c:k4,c:ts")
TBLPROPERTIES("hbase.table.name" = "xxxxx");

mongo表

CREATE EXTERNAL TABLE IF NOT EXISTS xx.xx (
k1 string,
k2 int
)
STORED BY 'com.mongodb.hadoop.hive.MongoStorageHandler'
WITH SERDEPROPERTIES('mongo.columns.mapping'='{"k1":"k1","k2":"k2"}','mongo.input.split_size'='16384')
TBLPROPERTIES('mongo.uri'='mongodb://xxx:xxx/table1.collection1');

等等

可以查看show create table的hive源码

https://github.com/apache/hive/blob/68ae4a5cd1b916098dc1deb2bcede5f862afd80e/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/creation/ShowCreateTableOperation.java

其中可以看出hive表的一些基本信息

private static final String CREATE_TABLE_TEMPLATE =
      "CREATE <" + TEMPORARY + "><" + EXTERNAL + ">TABLE `<" + NAME + ">`(\n" +
      "<" + LIST_COLUMNS + ">)\n" +
      "<" + COMMENT + ">\n" +
      "<" + PARTITIONS + ">\n" +
      "<" + BUCKETS + ">\n" +
      "<" + SKEWED + ">\n" +
      "<" + ROW_FORMAT + ">\n" +
      "<" + LOCATION_BLOCK + ">" +
      "TBLPROPERTIES (\n" +
      "<" + PROPERTIES + ">)\n";

  private String getCreateTableCommand(Table table) {
    ST command = new ST(CREATE_TABLE_TEMPLATE);

    command.add(NAME, desc.getTableName());
    command.add(TEMPORARY, getTemporary(table));
    command.add(EXTERNAL, getExternal(table));
    command.add(LIST_COLUMNS, getColumns(table));
    command.add(COMMENT, getComment(table));
    command.add(PARTITIONS, getPartitions(table));
    command.add(BUCKETS, getBuckets(table));
    command.add(SKEWED, getSkewed(table));
    command.add(ROW_FORMAT, getRowFormat(table));
    command.add(LOCATION_BLOCK, getLocationBlock(table));
    command.add(PROPERTIES, getProperties(table));

    return command.render();
  }

当用户输入一行create table语句的时候,可查看源码

https://github.com/apache/hive/blob/ff98efa7c6f2b241d8fddd0ac8dc55e817ecb234/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseUtils.java

美团点评 Hive SQL的编译过程

https://tech.meituan.com/2014/02/12/hive-sql-to-mapreduce.html

其中可以看到,建表语句首先会使用ANTLR(Hive的HiveParser由ANTLR 3生成)将其转换成一棵抽象语法树(AST)

public static ASTNode parse(String command) throws ParseException {
    return parse(command, null);
  }

然后可以使用getTable抽取其中的库名和表名

https://github.com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/ql/src/java/org/apache/hadoop/hive/ql/parse/AnalyzeCommandUtils.java

源码

public static Table getTable(ASTNode tree, BaseSemanticAnalyzer sa) throws SemanticException {
    String tableName = ColumnStatsSemanticAnalyzer.getUnescapedName((ASTNode) tree.getChild(0).getChild(0));
    String currentDb = SessionState.get().getCurrentDatabase();
    String [] names = Utilities.getDbTableName(currentDb, tableName);
    return sa.getTable(names[0], names[1], true);
  }

 

https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/parse/ParseDriver.java

 

public ASTNode parse(String command) throws ParseException {
    return parse(command, null);
  }

然后比如要提取inputformat,outputformat,serde和storageHandler

https://github.com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/ql/src/java/org/apache/hadoop/hive/ql/parse/StorageFormat.java

源码

 

要提取字段信息,SkewedValue,表名以及row format

https://github.com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java

源码

public static List<FieldSchema> getColumns(
      ASTNode ast, boolean lowerCase, TokenRewriteStream tokenRewriteStream,
      List<SQLPrimaryKey> primaryKeys, List<SQLForeignKey> foreignKeys,
      List<SQLUniqueConstraint> uniqueConstraints, List<SQLNotNullConstraint> notNullConstraints,
      List<SQLDefaultConstraint> defaultConstraints, List<SQLCheckConstraint> checkConstraints,
      Configuration conf) throws SemanticException {
我是源码
}

源码

 /**
   * Get the unqualified name from a table node.
   *
   * This method works for table names qualified with their schema (e.g., "db.table")
   * and table names without schema qualification. In both cases, it returns
   * the table name without the schema.
   *
   * @param node the table node
   * @return the table name without schema qualification
   *         (i.e., if name is "db.table" or "table", returns "table")
   */
  public static String getUnescapedUnqualifiedTableName(ASTNode node) {
    assert node.getChildCount() <= 2;

    if (node.getChildCount() == 2) {
      node = (ASTNode) node.getChild(1);
    }

    return getUnescapedName(node);
  }

源码

  protected void analyzeRowFormat(ASTNode child) throws SemanticException {
      child = (ASTNode) child.getChild(0);
      int numChildRowFormat = child.getChildCount();
      for (int numC = 0; numC < numChildRowFormat; numC++) {
        ASTNode rowChild = (ASTNode) child.getChild(numC);
        switch (rowChild.getToken().getType()) {
        case HiveParser.TOK_TABLEROWFORMATFIELD:
          fieldDelim = unescapeSQLString(rowChild.getChild(0)
              .getText());
          if (rowChild.getChildCount() >= 2) {
            fieldEscape = unescapeSQLString(rowChild
                .getChild(1).getText());
          }
          break;
        case HiveParser.TOK_TABLEROWFORMATCOLLITEMS:
          collItemDelim = unescapeSQLString(rowChild
              .getChild(0).getText());
          break;
        case HiveParser.TOK_TABLEROWFORMATMAPKEYS:
          mapKeyDelim = unescapeSQLString(rowChild.getChild(0)
              .getText());
          break;
        case HiveParser.TOK_TABLEROWFORMATLINES:
          lineDelim = unescapeSQLString(rowChild.getChild(0)
              .getText());
          if (!lineDelim.equals("\n")
              && !lineDelim.equals("10")) {
            throw new SemanticException(SemanticAnalyzer.generateErrorMessage(rowChild,
                ErrorMsg.LINES_TERMINATED_BY_NON_NEWLINE.getMsg()));
          }
          break;
        case HiveParser.TOK_TABLEROWFORMATNULL:
          nullFormat = unescapeSQLString(rowChild.getChild(0)
                    .getText());
          break;
        default:
          throw new AssertionError("Unkown Token: " + rowChild);
        }
      }
    }
  }

分区信息,首先通过取得Map对象

https://github.com/apache/hive/blob/6f18bbbc2e030ce7d446b2475037203cbd4f860d/ql/src/java/org/apache/hadoop/hive/ql/parse/AnalyzeCommandUtils.java

源码

  public static Map<String,String> getPartKeyValuePairsFromAST(Table tbl, ASTNode tree,
      HiveConf hiveConf) throws SemanticException {
    ASTNode child = ((ASTNode) tree.getChild(0).getChild(1));
    Map<String,String> partSpec = new HashMap<String, String>();
    if (child != null) {
      partSpec = DDLSemanticAnalyzer.getValidatedPartSpec(tbl, child, hiveConf, false);
    } //otherwise, it is the case of analyze table T compute statistics for columns;
    return partSpec;
  }

再转换成List<Partition>对象

https://github.com/apache/hive/blob/556531182dc989e12fd491d951b353b4df13fd47/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java

源码

public Map<String, String> partSpec; // has to use LinkedHashMap to enforce order
public List<Partition> partitions; // involved partitions in TableScanOperator/FileSinkOperator partitions = db.getPartitions(table, partSpec);

location信息,parsedLocation

https://github.com/apache/hive/blob/0213afb8a31af1f48d009edd41cec9e6c8942354/ql/src/java/org/apache/hadoop/hive/ql/parse/ImportSemanticAnalyzer.java

 

posted @ 2019-09-15 17:23  tonglin0325  阅读(2663)  评论(0编辑  收藏  举报