Hive Source Code (3): AST -> QB -> OperatorTree
This post first walks through the end-to-end flow; later posts will dig into the detailed steps inside each stage.
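For context, the ASTNode handed to analyzeInternal below comes out of the parser covered earlier in this series. Here is a minimal sketch of producing and inspecting one, assuming Hive 3.x's ql module on the classpath; ParseDriver.parse and ASTNode.dump are the actual entry points, while the class name and query are just for illustration:

import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;

public class DumpAst {
  public static void main(String[] args) throws Exception {
    ParseDriver pd = new ParseDriver();
    // The example query used throughout this post.
    ASTNode ast = pd.parse("select id, user_id from data group by 1, 2 order by 2 desc, 1");
    // dump() prints the tree of TOK_* nodes that analyzeInternal receives.
    System.out.println(ast.dump());
  }
}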
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer
The analyzeInternal method
public void analyzeInternal(ASTNode ast) throws SemanticException {
  analyzeInternal(ast, new PlannerContextFactory() {
    @Override
    public PlannerContext create() {
      return new PlannerContext();
    }
  });
}
void analyzeInternal(ASTNode ast, PlannerContextFactory pcf) throws SemanticException {
  LOG.info("Starting Semantic Analysis");
  boolean needsTransform = needsTransform();
  // select id,user_id from data group by 1,2 order by 2 desc, 1;
  // In Hive, GROUP BY and ORDER BY may reference SELECT positions instead of
  // spelling out the column names. Avoid this in production; it hurts readability.
  // This call rewrites the positions back into the actual SELECT expressions
  // (see the sketch after this listing).
  processPositionAlias(ast);
  PlannerContext plannerCtx = pcf.create();
  // This call resolves the AST into a QB; it is examined in detail below.
  // One important step here is the initialization of org.apache.hadoop.hive.ql.parse.QB,
  // which is essentially an in-memory representation of the SQL statement,
  // wrapping up its individual parts.
  if (!genResolvedParseTree(ast, plannerCtx)) {
    return;
  }
  if (HiveConf.getBoolVar(conf, ConfVars.HIVE_REMOVE_ORDERBY_IN_SUBQUERY)) {
    for (String alias : qb.getSubqAliases()) {
      // An ORDER BY without a LIMIT inside a subquery or view is removed here.
      removeOBInSubQuery(qb.getSubqForAlias(alias));
    }
  }
  // Configuration flag for the query results cache. In practice setting the cache
  // parameter has no lasting effect: in org.apache.hadoop.hive.cli.CliDriver's
  // processLocalCmd method (line 278), qp.close() releases all resources,
  // which also cleans up the cache classes.
  boolean isCacheEnabled = isResultsCacheEnabled();
  QueryResultsCache.LookupInfo lookupInfo = null;
  if (isCacheEnabled && !needsTransform && queryTypeCanUseCache()) {
    lookupInfo = createLookupInfoForQuery(ast);
    if (checkResultsCache(lookupInfo)) {
      return;
    }
  }
  ASTNode astForMasking;
  if (isCBOExecuted() && needsTransform &&
      (qb.isCTAS() || qb.isView() || qb.isMaterializedView() || qb.isMultiDestQuery())) {
    astForMasking = (ASTNode) ParseDriver.adaptor.dupTree(ast);
  } else {
    astForMasking = ast;
  }
  // This block is quite involved; debugging it, the key results are as follows.
  // sinkOp has many null fields; the important ones are the conf variable, which
  // holds the temporary file directories and the input/output file formats, and
  // the final rowSchema (_col0: bigint, _col1: string), with write_type=NOT_ACID.
  // genOPTree is partially explained further below.
  Operator sinkOp = genOPTree(ast, plannerCtx);
......
  // Derive the result schema's types; for my example: id bigint, user_id string.
  if (createVwDesc != null && !this.ctx.isCboSucceeded()) {
    resultSchema = convertRowSchemaToViewSchema(opParseCtx.get(sinkOp).getRowResolver());
  } else {
    if (resultSchema == null) {
      resultSchema = convertRowSchemaToResultSetSchema(opParseCtx.get(sinkOp).getRowResolver(),
          HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_RESULTSET_USE_UNIQUE_COLUMN_NAMES));
    }
  }
  // Copy some org.apache.hadoop.hive.ql.parse.QB properties over.
  copyInfoToQueryProperties(queryProperties);
  ParseContext pCtx = new ParseContext(queryState, opToPartPruner, opToPartList, topOps,
      new HashSet<JoinOperator>(joinContext.keySet()),
      new HashSet<SMBMapJoinOperator>(smbMapJoinContext.keySet()),
      loadTableWork, loadFileWork, columnStatsAutoGatherContexts, ctx, idToTableNameMap, destTableId, uCtx,
      listMapJoinOpsNoReducer, prunedPartitions, tabNameToTabObject, opToSamplePruner,
      globalLimitCtx, nameToSplitSample, inputs, rootTasks, opToPartToSkewedPruner,
      viewAliasToInput, reduceSinkOperatorsAddedByEnforceBucketingSorting,
      analyzeRewrite, tableDesc, createVwDesc, materializedViewUpdateDesc,
      queryProperties, viewProjectToTableSchema, acidFileSinks);
  pCtx.setSemiJoinHints(parseSemiJoinHint(getQB().getParseInfo().getHintList()));
  pCtx.setDisableMapJoin(disableMapJoinWithHint(getQB().getParseInfo().getHintList()));
  // View creation code; skipped here.
  .......
  // Collect table key statistics, presumably to support bucketing better;
  // the idea seems to be to determine the number of buckets from the number of keys.
  if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_TABLEKEYS)) {
    TableAccessAnalyzer tableAccessAnalyzer = new TableAccessAnalyzer(pCtx);
    setTableAccessInfo(tableAccessAnalyzer.analyzeTableAccess());
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Before logical optimization\n" + Operator.toString(pCtx.getTopOps().values()));
  }
  Optimizer optm = new Optimizer();
  optm.setPctx(pCtx);
  optm.initialize(conf);
  // Every logical optimizer implements a common transform method, and the whole
  // optimization pass is driven from this call; see the transformations.add(...)
  // calls in org.apache.hadoop.hive.ql.optimizer.Optimizer
  // (a paraphrased sketch of the loop follows this listing).
  pCtx = optm.optimize();
  if (pCtx.getColumnAccessInfo() != null) {
    setColumnAccessInfo(pCtx.getColumnAccessInfo());
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("After logical optimization\n" + Operator.toString(pCtx.getTopOps().values()));
  }
  // Column-access information for authorization and scan-column statistics;
  // not our focus for now.
  boolean isColumnInfoNeedForAuth = SessionState.get().isAuthorizationModeV2()
      && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_AUTHORIZATION_ENABLED);
  if (isColumnInfoNeedForAuth
      || HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS)) {
    ColumnAccessAnalyzer columnAccessAnalyzer = new ColumnAccessAnalyzer(pCtx);
    setColumnAccessInfo(columnAccessAnalyzer.analyzeColumnAccess(this.getColumnAccessInfo()));
  }
  // Compile the logical plan into a physical plan; covered in detail later
  // (a paraphrased sketch of how the compiler is chosen follows this listing).
  if (!ctx.getExplainLogical()) {
    TaskCompiler compiler = TaskCompilerFactory.getCompiler(conf, pCtx);
    compiler.init(queryState, console, db);
    compiler.compile(pCtx, rootTasks, inputs, outputs);
    fetchTask = pCtx.getFetchTask();
  }
  QueryPlanPostProcessor qp = new QueryPlanPostProcessor(rootTasks, acidFileSinks, ctx.getExecutionId());
  final Optional<TezTask> optionalTezTask =
      rootTasks.stream().filter(task -> task instanceof TezTask).map(task -> (TezTask) task)
          .findFirst();
  if (optionalTezTask.isPresent()) {
    final TezTask tezTask = optionalTezTask.get();
    rootTasks.stream()
        .filter(task -> task.getWork() instanceof DDLWork)
        .map(task -> (DDLWork) task.getWork())
        .filter(ddlWork -> ddlWork.getPreInsertTableDesc() != null)
        .map(ddlWork -> ddlWork.getPreInsertTableDesc())
        .map(ddlPreInsertTask -> new InsertCommitHookDesc(ddlPreInsertTask.getTable(),
            ddlPreInsertTask.isOverwrite()))
        .forEach(insertCommitHookDesc -> tezTask.addDependentTask(
            TaskFactory.get(new DDLWork(getInputs(), getOutputs(), insertCommitHookDesc), conf)));
  }
LOG.info("Completed plan generation");
//缓存相关 暂不关注
if (HiveConf.getBoolVar(this.conf, HiveConf.ConfVars.HIVE_STATS_COLLECT_SCANCOLS)) {
putAccessedColumnsToReadEntity(inputs, columnAccessInfo);
}
if (isCacheEnabled && lookupInfo != null) {
if (queryCanBeCached()) {
QueryResultsCache.QueryInfo queryInfo = createCacheQueryInfoForQuery(lookupInfo);
setCacheUsage(new CacheUsage(
CacheUsage.CacheStatus.CAN_CACHE_QUERY_RESULTS, queryInfo));
}
}
}
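To make the positional-alias rewrite at the top of the method concrete, here is a minimal sketch of what processPositionAlias conceptually does for GROUP BY. resolveGroupByPositions is a hypothetical helper, not Hive's verbatim code; it assumes the children of TOK_SELECT are TOK_SELEXPR nodes and that positions are 1-based:

// Hypothetical helper: replace a numeric position under TOK_GROUPBY with the
// corresponding SELECT expression, e.g. "group by 1, 2" -> "group by id, user_id".
static void resolveGroupByPositions(ASTNode selectNode, ASTNode groupByNode) {
  for (int i = 0; i < groupByNode.getChildCount(); i++) {
    ASTNode child = (ASTNode) groupByNode.getChild(i);
    if (child.getToken().getType() == HiveParser.Number) {
      int pos = Integer.parseInt(child.getText()); // e.g. "2"
      // Each TOK_SELEXPR wraps the actual expression as its first child.
      ASTNode selExpr = (ASTNode) selectNode.getChild(pos - 1);
      groupByNode.setChild(i, selExpr.getChild(0));
    }
  }
}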
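The logical optimization driven by optm.optimize() boils down to a loop over registered Transform objects: initialize(conf) conditionally adds transforms such as ColumnPruner and PredicatePushDown to a list, and optimize() applies them in order. A paraphrased sketch of the loop, from memory of the Hive 3.x source, not verbatim:

// Each Transform rewrites the ParseContext (the operator tree plus its
// metadata) and returns it for the next transform in the list.
public ParseContext optimize() throws SemanticException {
  for (Transform t : transformations) {
    pctx = t.transform(pctx);
  }
  return pctx;
}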
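As for the physical compilation step, TaskCompilerFactory.getCompiler picks the compiler by execution engine. A paraphrased sketch, from memory of the Hive 3.x source, not verbatim:

// hive.execution.engine decides which TaskCompiler turns the logical plan
// into Tez, Spark, or MapReduce tasks.
public static TaskCompiler getCompiler(HiveConf conf, ParseContext parseContext) {
  String engine = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE);
  if ("tez".equals(engine)) {
    return new TezCompiler();
  } else if ("spark".equals(engine)) {
    return new SparkCompiler();
  }
  return new MapReduceCompiler();
}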
The genResolvedParseTree(ast, plannerCtx) method called above: the QB initialization part
boolean genResolvedParseTree(ASTNode ast, PlannerContext plannerCtx) throws SemanticException {
  ASTNode child = ast;
  this.ast = ast;
  viewsExpanded = new ArrayList<String>();
  ctesExpanded = new ArrayList<String>();
  // As you can see, this switch matches the various statement types:
  // create view, alter view, select, and so on.
  switch (ast.getToken().getType()) {
  case HiveParser.TOK_SET_AUTOCOMMIT:
    assert ast.getChildCount() == 1;
    if (ast.getChild(0).getType() == HiveParser.TOK_TRUE) {
      setAutoCommitValue(true);
    } else if (ast.getChild(0).getType() == HiveParser.TOK_FALSE) {
      setAutoCommitValue(false);
    } else {
      assert false : "Unexpected child of TOK_SET_AUTOCOMMIT: " + ast.getChild(0).getType();
    }
  ........
}
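The elided part above is where the QB actually gets filled in. From memory of the Hive 3.x source (paraphrased, not verbatim), the tail of genResolvedParseTree looks roughly like this, which is how execution reaches the doPhase1 method shown next:

// Paraphrased sketch: phase 1 walks the AST into the QB, then table and
// partition metadata is fetched from the metastore.
Phase1Ctx ctx_1 = initPhase1Ctx();
if (!doPhase1(child, qb, ctx_1, plannerCtx)) {
  return false; // phase 1 bailed out
}
LOG.info("Completed phase 1 of Semantic Analysis");
getMetaData(qb, createVwDesc == null);
LOG.info("Completed getting MetaData in Semantic Analysis");
return true;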
The doPhase1 method
public boolean doPhase1(ASTNode ast, QB qb, Phase1Ctx ctx_1, PlannerContext plannerCtx)
    throws SemanticException {
  // As you can see, this is a large switch over token types that walks the AST
  // and stores the pieces into the QB.
  boolean phase1Result = true;
  QBParseInfo qbp = qb.getParseInfo();
  boolean skipRecursion = false;
  if (ast.getToken() != null) {
    skipRecursion = true;
    switch (ast.getToken().getType()) {
    case HiveParser.TOK_SELECTDI:
      qb.countSelDi();
      // falls through to TOK_SELECT
    case HiveParser.TOK_SELECT:
      .......
    case HiveParser.TOK_WHERE:
      .......
    case HiveParser.TOK_INSERT_INTO:
      ......
}
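What a branch actually stores: from memory of the Hive 3.x source (paraphrased, not verbatim), the TOK_WHERE case and the recursion tail look roughly like this. Every branch follows the same pattern: stash the clause's ASTNode into QBParseInfo under the current destination, then recurse into the children:

case HiveParser.TOK_WHERE:
  // Remember the WHERE clause's AST for the current insert destination;
  // it becomes a FilterOperator later, in genPlan.
  qbp.setWhrExprForClause(ctx_1.dest, ast);
  break;
...
if (!skipRecursion) {
  // No token matched above: recurse into all children.
  int childCount = ast.getChildCount();
  for (int i = 0; i < childCount && phase1Result; i++) {
    phase1Result = doPhase1((ASTNode) ast.getChild(i), qb, ctx_1, plannerCtx);
  }
}
return phase1Result;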
The genOPTree method
Operator sinkOp = genOPTree(ast, plannerCtx); // org.apache.hadoop.hive.ql.parse.CalcitePlanner overrides this method, so execution enters CalcitePlanner's version
ASTNode newAST = getOptimizedAST(); // when CBO fails, the "CBO failed, skipping CBO" error surfaces right here (line 450)
sinkOp = super.genOPTree(ast, plannerCtx); // falls back into the parent class's method (line 593)
// Back in the genOPTree method of org.apache.hadoop.hive.ql.parse.SemanticAnalyzer:
Operator genOPTree(ASTNode ast, PlannerContext plannerCtx) throws SemanticException {
  List<ASTNode> hintsList = new ArrayList<>();
  getHintsFromQB(qb, hintsList);
  getQB().getParseInfo().setHintList(hintsList); // just sets the hints; skip ahead
  // Here is where the real work happens. Rather than debug through this block,
  // I will simply present the final result; the code is much like what we saw
  // above, with layer upon layer of nested calls.
  return genPlan(qb);
}
Apart from the outermost SELECT's type, and some input/output classes and directories inside the conf, the interesting piece of the debugged result is parentOperators: it records the chain from the current FS[18] through SEL[17], join[16], ... all the way back to ..[0].
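A quick way to see that chain for yourself is to walk parentOperators upward from the sink. A minimal sketch, assuming the sinkOp obtained above; Hive's Operator.toString() prints names like FS[18]:

// Walk one parent chain from the FileSinkOperator toward the root,
// printing e.g. FS[18] <- SEL[17] <- join[16] <- ... <- [0].
Operator<?> op = sinkOp;
StringBuilder chain = new StringBuilder(op.toString());
while (op.getParentOperators() != null && !op.getParentOperators().isEmpty()) {
  op = op.getParentOperators().get(0); // follow only the first parent branch
  chain.append(" <- ").append(op.toString());
}
System.out.println(chain);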
After years of moving bricks without ever grasping the essentials, I have taken to reading the source code in hopes of finding the true scriptures.