【GreatSQL优化器-17】DYNAMIC RANGE

【GreatSQL优化器-17】DYNAMIC RANGE

一、DYNAMIC RANGE介绍

GreatSQL 的优化器有一种扫描方式是动态范围扫描方式,类似于“已读乱回”模式,这种模式是在表有多个索引的情况下,对驱动表连接的时候部分选择索引的情况。优化器没有找到好的索引可以使用,但发现在知道前面表的列值后,可能会使用某些索引。对于前面表中的每个行组合,优化器检查是否可以使用 range 或 index merge 访问方法来检索行。虽然这不是很快,但比执行完全没有索引的连接要快。

下面用一个简单的例子来说明直方图怎么应用在优化器。

CREATE TABLE t1 (c1 INT PRIMARY KEY, c2 INT,date1 DATETIME);
INSERT INTO t1 VALUES (1,10,'2021-03-25 16:44:00.123456'),(2,1,'2022-03-26 16:44:00.123456'),(3,4,'2023-03-27 16:44:00.123456'),(5,5,'2024-03-25 16:44:00.123456'),(7,null,'2020-03-25 16:44:00.123456'),(8,10,'2020-10-25 16:44:00.123456'),(11,16,'2023-03-25 16:44:00.123456');
CREATE TABLE t2 (cc1 INT PRIMARY KEY, cc2 INT);
INSERT INTO t2 VALUES (1,3),(2,1),(3,2),(4,3),(5,15);
CREATE TABLE t3 (ccc1 INT, ccc2 varchar(100));
INSERT INTO t3 VALUES (1,'aa1'),(2,'bb1'),(3,'cc1'),(4,'dd1'),(null,'ee');
CREATE INDEX idx1 ON t1(c2);
CREATE INDEX idx2 ON t1(c2,date1);
CREATE INDEX idx2_1 ON t2(cc2);
CREATE INDEX idx3_1 ON t3(ccc1);
greatsql> EXPLAIN SELECT * FROM t1 join t3 ON t1.c1=t3.ccc1 or t1.c2<5;
+----+-------------+-------+------------+------+-------------------+------+---------+------+------+----------+------------------------------------------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+------+-------------------+------+---------+------+------+----------+------------------------------------------------+
| 1 | SIMPLE | t3 | NULL | ALL | idx3_1 | NULL | NULL | NULL | 5 | 100.00 | NULL |
| 1 | SIMPLE | t1 | NULL | ALL | PRIMARY,idx1,idx2 | NULL | NULL | NULL | 7 | 43.67 | Range checked for each record (index map: 0x7) |
-- 这里的结果出现了Range checked,说明进行了DYNAMIC_RANGE
+----+-------------+-------+------------+------+-------------------+------+---------+------+------+----------+------------------------------------------------+
greatsql> SELECT * FROM t1 join t3 ON t1.c1=t3.ccc1 or t1.c2<5;
+----+------+---------------------+------+------+
| c1 | c2 | date1 | ccc1 | ccc2 |
+----+------+---------------------+------+------+
| 1 | 10 | 2021-03-25 16:44:00 | 1 | aa1 |
| 2 | 1 | 2022-03-26 16:44:00 | 1 | aa1 |
| 3 | 4 | 2023-03-27 16:44:00 | 1 | aa1 |
| 2 | 1 | 2022-03-26 16:44:00 | 2 | bb1 |
| 3 | 4 | 2023-03-27 16:44:00 | 2 | bb1 |
| 2 | 1 | 2022-03-26 16:44:00 | 3 | cc1 |
| 3 | 4 | 2023-03-27 16:44:00 | 3 | cc1 |
| 2 | 1 | 2022-03-26 16:44:00 | 4 | dd1 |
| 3 | 4 | 2023-03-27 16:44:00 | 4 | dd1 |
| 2 | 1 | 2022-03-26 16:44:00 | NULL | ee |
| 3 | 4 | 2023-03-27 16:44:00 | NULL | ee |
+----+------+---------------------+------+------+
greatsql> SELECT * FROM INFORMATION_SCHEMA.OPTIMIZER_TRACE;
"attaching_conditions_to_tables": {
"original_condition": "((`t1`.`c1` = `t3`.`ccc1`) or (`t1`.`c2` < 5))",
"attached_conditions_computation": [
{
"table": "`t1`",
"rechecking_index_usage": { -- 这里对t1进行了recheck
"recheck_reason": "not_first_table",
"range_analysis": {
"table_scan": {
"rows": 7,
"cost": 3.05
},
"potential_range_indexes": [
{
"index": "PRIMARY",
"usable": true,
"key_parts": [
"c1"
]
},
{
"index": "idx1",
"usable": true,
"key_parts": [
"c2",
"c1"
]
},
{
"index": "idx2",
"usable": true,
"key_parts": [
"c2",
"date1",
"c1"
]
}
],
"best_covering_index_scan": {
"index": "idx2",
"cost": 0.952742,
"chosen": true
},
"setup_range_conditions": [
],
"group_index_range": {
"chosen": false,
"cause": "not_single_table"
},
"skip_scan_range": {
"chosen": false,
"cause": "not_single_table"
},
"analyzing_range_alternatives": {
"range_scan_alternatives": [
],
"analyzing_roworder_intersect": {
"usable": false,
"cause": "too_few_roworder_scans"
}
},
"analyzing_index_merge_union": [
{
"indexes_to_merge": [
{
"range_scan_alternatives": [
{
"index": "PRIMARY",
"chosen": false,
"cause": "depends_on_unread_values"
}
],
"chosen": false,
"cause": "cost"
},
{
"range_scan_alternatives": [
{
"index": "idx1",
"ranges": [
"NULL < c2 < 5"
],
"index_dives_for_eq_ranges": true,
"rowid_ordered": false,
"using_mrr": false,
"index_only": true,
"in_memory": 1,
"rows": 2,
"cost": 0.460274,
"chosen": true
},
{
"index": "idx2",
"ranges": [
"NULL < c2 < 5"
],
"index_dives_for_eq_ranges": true,
"rowid_ordered": false,
"using_mrr": false,
"index_only": true,
"in_memory": 1,
"rows": 2,
"cost": 0.460457,
"chosen": false,
"cause": "cost"
}
],
"chosen": false,
"cause": "cost"
}
],
"cost_of_reading_ranges": 0,
"chosen": false,
"cause": "cost"
}
]
}
}
}
],
"attached_conditions_summary": [
{
"table": "`t3`",
"attached": null
},
{
"table": "`t1`",
"attached": "((`t1`.`c1` = `t3`.`ccc1`) or (`t1`.`c2` < 5))"
}
]
}
"join_execution": {
"select#": 1,
"steps": [
{
"rows_estimation_per_outer_row": { -- 这里只截取t3读第一行时候t1的索引选择
"table": "`t1`",
"range_analysis": {
"table_scan": {
"rows": 7,
"cost": 3.05
},
"potential_range_indexes": [
{
"index": "PRIMARY",
"usable": true,
"key_parts": [
"c1"
]
},
{
"index": "idx1",
"usable": true,
"key_parts": [
"c2",
"c1"
]
},
{
"index": "idx2",
"usable": true,
"key_parts": [
"c2",
"date1",
"c1"
]
}
],
"best_covering_index_scan": {
"index": "idx2",
"cost": 0.952742,
"chosen": true
},
"setup_range_conditions": [
],
"group_index_range": {
"chosen": false,
"cause": "not_single_table"
},
"skip_scan_range": {
"chosen": false,
"cause": "not_single_table"
},
"analyzing_range_alternatives": {
"range_scan_alternatives": [
],
"analyzing_roworder_intersect": {
"usable": false,
"cause": "too_few_roworder_scans"
}
},
"analyzing_index_merge_union": [
{
"indexes_to_merge": [
{
"range_scan_alternatives": [
{
"index": "PRIMARY",
"ranges": [
"c1 = 1"
],
"index_dives_for_eq_ranges": true,
"rowid_ordered": true,
"using_mrr": false,
"index_only": true,
"in_memory": 1,
"rows": 1,
"cost": 0.36,
"chosen": true
}
],
"index_to_merge": "PRIMARY",
"cumulated_cost": 0.36
},
{
"range_scan_alternatives": [
{
"index": "idx1",
"ranges": [
"NULL < c2 < 5"
],
"index_dives_for_eq_ranges": true,
"rowid_ordered": false,
"using_mrr": false,
"index_only": true,
"in_memory": 1,
"rows": 2,
"cost": 0.460274,
"chosen": true
},
{
"index": "idx2",
"ranges": [
"NULL < c2 < 5"
],
"index_dives_for_eq_ranges": true,
"rowid_ordered": false,
"using_mrr": false,
"index_only": true,
"in_memory": 1,
"rows": 2,
"cost": 0.460457,
"chosen": false,
"cause": "cost"
}
],
"index_to_merge": "idx1",
"cumulated_cost": 0.820274
}
],
"cost_of_reading_ranges": 0.820274,
"cost_of_mapping_rowid_in_non_clustered_pk_scan": 0.1,
"cost_sort_rowid_and_read_disk": 0.4375,
"use_roworder_index_merge": true, -- 根据上面结果选择了主键合并idx1的结果
"cause": "cost"
}
]
}
}
},
greatsql> EXPLAIN SELECT * FROM t1 join t3 ON t1.c1=t3.ccc1 or t1.c2<5;
+----+-------------+-------+------------+------+-------------------+------+---------+------+------+----------+------------------------------------------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+------+-------------------+------+---------+------+------+----------+------------------------------------------------+
| 1 | SIMPLE | t3 | NULL | ALL | idx3_1 | NULL | NULL | NULL | 5 | 100.00 | NULL |
| 1 | SIMPLE | t1 | NULL | ALL | PRIMARY,idx1,idx2 | NULL | NULL | NULL | 7 | 42.85 | Range checked for each record (index map: 0x7) | -- 这里0x7指的是t1表的3条索引
+----+-------------+-------+------------+------+-------------------+------+---------+------+------+----------+------------------------------------------------+
-- 驱动表t3一共5条数据,每行记录都进行了一次test_quick_select()来查找t1是否有可用的更优索引

详细DYNAMIC RANGE执行过程如下表所示

t3.ccc1 t1全表扫描cost t1索引union t1索引sort-union的cost t1联合索引idx2 t1范围扫描 结果
1 3.8 排序[PRIMARY,idx1,idx2], idx2加进去的cost太大放弃 1.58027 1.71097 PRIMARY,idx1索引合并
2 3.8 排序[PRIMARY,idx1,idx2], idx2的cost太大放弃 1.58027 1.71097 PRIMARY,idx1索引合并
3 3.8 排序[PRIMARY,idx1,idx2], idx2的cost太大放弃 1.58027 1.71097 PRIMARY,idx1索引合并
4 3.8 排序[PRIMARY,idx1,idx2], idx2的cost太大放弃 1.58027 1.71097 PRIMARY,idx1索引合并
null 3.8 排序[idx1,idx2],cost太大, 不用合并 idx1的cost太大,不用合并 1.71097 idx2 1.21183 索引idx2

二、相关代码解释

DYNAMIC_RANGE 的判断在对表join排序以后执行make_join_query_block的时候,这时候要对非 join 第一张表做 recheck,recheck 以后有可能让表变为 DYNAMIC_RANGE 扫描方式。

static bool make_join_query_block(JOIN *join, Item *cond) {
// 满足以下条件的需要进行RECHECK,要么是非第一张表要么是有limit语句
if ((tab->type() == JT_ALL || tab->type() == JT_RANGE ||
tab->type() == JT_INDEX_MERGE || tab->type() == JT_INDEX_SCAN) &&
tab->use_quick != QS_RANGE) {
if (cond && // 1a
(tab->keys() != tab->const_keys) && // 1b
(i > 0 || // 1c
(join->query_block->master_query_expression()->item &&
cond->is_outer_reference())))
recheck_reason = NOT_FIRST_TABLE;
else if (!tab->const_keys.is_clear_all() && // 2a
i == join->const_tables && // 2b
(join->query_expression()->select_limit_cnt <
(tab->position()->rows_fetched *
tab->position()->filter_effect)) && // 2c
!join->calc_found_rows) // 2d
recheck_reason = LOW_LIMIT;
// 满足这个if条件的执行QS_DYNAMIC_RANGE,详情见下面几张表
if (!tab->table()->quick_keys.is_subset(tab->checked_keys) ||
!tab->needed_reg.is_subset(tab->checked_keys)) {
tab->keys().merge(tab->table()->quick_keys);
tab->keys().merge(tab->needed_reg);
if (!tab->needed_reg.is_clear_all() &&
(tab->table()->quick_keys.is_clear_all() ||
(tab->range_scan() &&
(tab->range_scan()->num_output_rows() >= 100.0)))) {
tab->use_quick = QS_DYNAMIC_RANGE;
tab->set_type(JT_ALL);
} else
tab->use_quick = QS_RANGE;
}
}
}
// 实际使用代码
bool DynamicRangeIterator::Init() {
// 每次迭代器开始的时候先获取最佳索引组合和AccessPath
AccessPath *range_scan;
int rc = test_quick_select(&range_scan);
}

表一:需要进行 recheck 的原因

场景 条件
非join表连接的第一张表 存在针对这张表的条件
连接的第一张表 没有第一张表的列与常量值的比较,limit值小于预估的满足条件的行数

表二:quick_type 类型

类型 说明
QS_NONE 不使用快速扫描
QS_RANGE 使用范围扫描
QS_DYNAMIC_RANGE 使用动态范围

表三:表用 QS_DYNAMIC_RANGE 的场合

条件(以下必须全部满足) 说明
条件列存在索引 有机会生成mm tree
keyuse_array数组没有值 有值的话执行REF扫描,一般有OR条件时候导致该数组没有值
表没有对应的mm tree 无法生成mm tree的条件
这张表不是join连接的第一张表
对表执行recheck recheck的原因见表一
recheck以后生成AccessPath但是找到的范围内行数大于等于100行

三、实际例子说明

接下来看几个例子来说明上面的代码:

greatsql> EXPLAIN SELECT * FROM t1 join t3 ON t1.c1=t3.ccc1 or t1.c2<5;
+----+-------------+-------+------------+------+-------------------+------+---------+------+------+----------+------------------------------------------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+------+-------------------+------+---------+------+------+----------+------------------------------------------------+
| 1 | SIMPLE | t3 | NULL | ALL | idx3_1 | NULL | NULL | NULL | 5 | 100.00 | NULL |
| 1 | SIMPLE | t1 | NULL | ALL | PRIMARY,idx1,idx2 | NULL | NULL | NULL | 7 | 43.67 | Range checked for each record (index map: 0x7) |
+----+-------------+-------+------------+------+-------------------+------+---------+------+------+----------+------------------------------------------------+
-- 第一次对单表进行估计的时候t1.c1=t3.ccc1条件没有加入到mm tree,只对t1.c1打了一个标记MAYBE_KEY表示后面可能可以用到这个索引,因此最后没有生成mm tree,在make_join_query_block因为已经确定表连接顺序可以使用t1.c1=t3.ccc1条件,这时候可以把两个条件结合起来对t1执行mm tree,因此这个时候才生成mm tree,而这个时候才能考虑range scan和索引合并等操作。

image-20250303143457768

t1 的 JOIN_TAB 的两个相关 key 变量,因此 keys() != const_keys

key idx2(c2,date1) idx1(c2) primary(c1) 说明
keys() 1 1 1 用来存放所有条件列涉及的key,这里指t1所有索引
const_keys 1 1 0 用来存放所有列与常量对比的key,这里指t1.c2<5

下面删除t1的一些数据,让join方式变为t1在前t3在后,这样的话对t1就不需要执行recheck,并且最后t1也通过索引进行扫描。

greatsql> DELETE FROM t1 WHERE c1 IN (7,8,11);
greatsql> EXPLAIN SELECT * FROM t1 JOIN t3 ON t1.c1=t3.ccc1 OR t1.c2<5;
+----+-------------+-------+------------+-------+-------------------+------+---------+------+------+----------+--------------------------------------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+-------+-------------------+------+---------+------+------+----------+--------------------------------------------+
| 1 | SIMPLE | t1 | NULL | index | PRIMARY,idx1,idx2 | idx2 | 11 | NULL | 3 | 100.00 | Using index |
| 1 | SIMPLE | t3 | NULL | ALL | idx3_1 | NULL | NULL | NULL | 10 | 100.00 | Using where; Using join buffer (hash join) |
+----+-------------+-------+------------+-------+-------------------+------+---------+------+------+----------+--------------------------------------------+

下面执行一下三张表的连接,看到因为t1不是第一张表,最后用到了DYNAMIC RANGE

greatsql> INSERT INTO t1 VALUES (7,null,'2020-03-25 16:44:00.123456'),(8,10,'2020-10-25 16:44:00.123456'),(11,16,'2023-03-25 16:44:00.123456');
greatsql> EXPLAIN SELECT * FROM t1 join t3 join t2 ON t1.c1=t3.ccc1 and t1.c1=t2.cc1 or t1.c2<5 ;
+----+-------------+-------+------------+-------+-------------------+--------+---------+------+------+----------+------------------------------------------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+-------+-------------------+--------+---------+------+------+----------+------------------------------------------------+
| 1 | SIMPLE | t2 | NULL | index | PRIMARY | idx2_1 | 5 | NULL | 5 | 100.00 | Using index |
| 1 | SIMPLE | t1 | NULL | ALL | PRIMARY,idx1,idx2 | NULL | NULL | NULL | 7 | 42.85 | Range checked for each record (index map: 0x7) |
| 1 | SIMPLE | t3 | NULL | ALL | idx3_1 | NULL | NULL | NULL | 5 | 100.00 | Using where; Using join buffer (hash join) |
+----+-------------+-------+------------+-------+-------------------+--------+---------+------+------+----------+------------------------------------------------+

如果遇到 DYNAMIC RANGE 的情况,每行进行索引判断肯定比一开始就决定单纯走索引浪费时间,为了避免这种情况,可以采取以下的方法来避免。

1、强制让 DYNAMIC RANGE 的表作为驱动表

greatsql> EXPLAIN SELECT /*+ qb_name(qb1) JOIN_ORDER(@qb1 t1,t2,t3) */ * FROM t1 join t3 join t2 ON t1.c1=t3.ccc1 and t1.c1=t2.cc1 or t1.c2<5 ;
+----+-------------+-------+------------+-------+-------------------+--------+---------+------+------+----------+---------------------------------------------------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+-------+-------------------+--------+---------+------+------+----------+---------------------------------------------------------+
| 1 | SIMPLE | t1 | NULL | index | PRIMARY,idx1,idx2 | idx2 | 11 | NULL | 7 | 100.00 | Using index |
| 1 | SIMPLE | t2 | NULL | index | PRIMARY | idx2_1 | 5 | NULL | 5 | 100.00 | Using where; Using index; Using join buffer (hash join) |
| 1 | SIMPLE | t3 | NULL | ALL | idx3_1 | NULL | NULL | NULL | 5 | 100.00 | Using where; Using join buffer (hash join) |
+----+-------------+-------+------------+-------+-------------------+--------+---------+------+------+----------+---------------------------------------------------------+

2、给 DYNAMIC RANGE 的表强制指定索引

greatsql> EXPLAIN SELECT * FROM t1 FORCE INDEX(idx1,idx2) join t3 join t2 ON t1.c1=t3.ccc1 and t1.c1=t2.cc1 or t1.c2<5 ;
+----+-------------+-------+------------+-------+---------------+--------+---------+------+------+----------+---------------------------------------------------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+-------+---------------+--------+---------+------+------+----------+---------------------------------------------------------+
| 1 | SIMPLE | t2 | NULL | index | PRIMARY | idx2_1 | 5 | NULL | 5 | 100.00 | Using index |
| 1 | SIMPLE | t1 | NULL | index | idx1,idx2 | idx2 | 11 | NULL | 7 | 42.85 | Using where; Using index; Using join buffer (hash join) |
| 1 | SIMPLE | t3 | NULL | ALL | idx3_1 | NULL | NULL | NULL | 5 | 100.00 | Using where; Using join buffer (hash join) |
+----+-------------+-------+------------+-------+---------------+--------+---------+------+------+----------+---------------------------------------------------------+

3、改变条件,删除or条件,使用 keyuse_array 数组来选择路径。

greatsql> EXPLAIN SELECT * FROM t1 join t3 join t2 ON t1.c1=t3.ccc1 and t1.c1=t2.cc1;
+----+-------------+-------+------------+--------+---------------+---------+---------+------------+------+----------+-------------+
| id | select_type | table | partitions | type | possible_keys | key | key_len | ref | rows | filtered | Extra |
+----+-------------+-------+------------+--------+---------------+---------+---------+------------+------+----------+-------------+
| 1 | SIMPLE | t2 | NULL | index | PRIMARY | idx2_1 | 5 | NULL | 5 | 100.00 | Using index |
| 1 | SIMPLE | t3 | NULL | ref | idx3_1 | idx3_1 | 5 | db1.t2.cc1 | 1 | 100.00 | NULL |
| 1 | SIMPLE | t1 | NULL | eq_ref | PRIMARY | PRIMARY | 4 | db1.t2.cc1 | 1 | 100.00 | NULL |
+----+-------------+-------+------------+--------+---------------+---------+---------+------------+------+----------+-------------+

四、总结

从上面关于 DYNAMIC RANGE 的解释我们知道这个状态需要经过一系列判断,并且只在特定条件才有可能出现,但是每行进行一次索引判断还是很消耗资源的,最好还是直接走索引,所以要尽量避免这种情况,适当的时候可以进行 SQL 命令干预。


Enjoy GreatSQL 😃

关于 GreatSQL

GreatSQL是适用于金融级应用的国内自主开源数据库,具备高性能、高可靠、高易用性、高安全等多个核心特性,可以作为MySQL或Percona Server的可选替换,用于线上生产环境,且完全免费并兼容MySQL或Percona Server。

相关链接: GreatSQL社区 Gitee GitHub Bilibili

GreatSQL社区:

社区博客有奖征稿详情:https://greatsql.cn/thread-100-1-1.html

image-20230105161905827

技术交流群:

微信:扫码添加GreatSQL社区助手微信好友,发送验证信息加群

image-20221030163217640

posted @   GreatSQL  阅读(7)  评论(0编辑  收藏  举报
点击右上角即可分享
微信分享提示