postgresql宕机排查
postgresql-性能环境宕机排查
问题背景
测试人员在性能测试时,给数据库加压数据过程中服务器宕机,启动数据库十余分钟后还是显示数据库一直在启动中。
由于当时周末,我在外,没有带笔记本。听到这个消息后,首先询问了启动日志是否有异常,反馈是没有错误日志,日志打印了很多启动中。
猜测
数据库一直启动中,是否由于事务日志需要回滚导致的呢?
让测试查看了pg_xlog日志文件大小,居然有90GB+
。
pg_xlog简介
abase3.6.2目录中有三类日志文件,pg_log
记录日志文件,pg_xlog
记录事务日志,pg_clog
记录事务提交日志
abase5.0以后为了防止误删三类日志文件分别对应:pg_log->log,pg_xlog->pg_wal,pg_clog->pg_xact
正常情况下pg_xlog
的大小应该是小于max_wal_size=1GB
如果一直在启动中说明在回滚日志,只需要等待即可。
宕机问题排查
听到宕机后,我也火速回家处理问题
按照正常的磁盘读写速度,90GB
也用不了多久,但是却还原了两个多小时
查看磁盘读写速度只有7MB/S
,而磁盘的io
负荷已经100%
最终排查出磁盘读写慢的主要问题是,服务器在前几天扩容时,扩的是走网络存储
,而不是物理存储
,导致读写慢。
而生成大量日志的原因是因为数据库在加压,加压函数慢,大量未提交的事务,导致生成大量的wal日志
数据库宕机异常日志
database system was not properly shut down; automatic recovery in progress
redo starts at 4F1/AB018A90
ending log output to stderr Future log output will go to log destination "csvlog".
database system was interrupted while in recovery at 2019-05-26 13:19:42 CST This probably means that some data is corrupted and you will have to use the last backup for recovery.
MultiXact member wraparound protections are now enabled
database system is ready to accept connections
autovacuum launcher started
网络存储
物理存储为datastore,网络存储为target,网络存储较慢
加压函数
大量类似的加压函数,这里选取一条以作说明
CREATE OR REPLACE FUNCTION "db_scld"."abase2019rycgjl"(cyclecount int4)
RETURNS "pg_catalog"."varchar" AS $BODY$
declare
v_db_scld_t_scld_rycgjl_c_bh bpchar;
v_db_scld_t_scld_rycgjl_c_cprscjlbh bpchar;
v_db_scld_t_scld_rycgjl_c_jdrybm varchar(300);
v_db_scld_t_scld_rycgjl_c_jdryxm varchar(300);
v_db_scld_t_scld_rycgjl_d_cgrq date;
v_db_scld_t_scld_rycgjl_c_cgqk varchar(300);
v_db_scld_t_scld_rycgjl_n_gzgs numeric;
v_db_scld_t_scld_rycgjl_c_sfyx varchar(300);
v_db_scld_t_scld_rycgjl_c_cjr varchar(300);
v_db_scld_t_scld_rycgjl_c_gxr varchar(300);
v_db_scld_t_scld_rycgjl_dt_crsj timestamp;
v_db_scld_t_scld_rycgjl_dt_gxsj timestamp;
v_db_scld_t_scld_rycgjl_c_cjbm varchar(300);
v_db_scld_t_scld_rycgjl_c_cjdw varchar(300);
v_s varchar(300);
v_scdd varchar(300);
BEGIN
for i in 1.. cycleCount loop
v_db_scld_t_scld_rycgjl_c_cprscjlbh :=(select C_BH from db_scld.t_scld_cprscjl order by random() LIMIT 1 );
v_scdd := (select c_cjbm from db_scld.t_scld_cprscjl where c_bh= v_db_scld_t_scld_rycgjl_c_cprscjlbh );
for j in 1 .. 144 loop
v_db_scld_t_scld_rycgjl_c_bh :=get_UUID(32,32);
v_db_scld_t_scld_rycgjl_c_jdrybm :=(select c_jdrybm from db_scld.t_jdry
where c_bmbm = v_scdd and c_sfyx ='1' and c_ryzszt not in ('05','12','11','07','09','13') order by random() limit 1 );
v_db_scld_t_scld_rycgjl_c_jdryxm :=(select c_xm from db_scld.t_jdry where c_jdrybm = v_db_scld_t_scld_rycgjl_c_jdrybm);
v_db_scld_t_scld_rycgjl_d_cgrq :=(select d_scrq from db_scld.T_SCLD_CPRSCJL where C_BH = v_db_scld_t_scld_rycgjl_c_cprscjlbh);
v_db_scld_t_scld_rycgjl_c_cgqk :=get_split_str('01');
v_db_scld_t_scld_rycgjl_n_gzgs :=get_rand_int(8,8);
v_db_scld_t_scld_rycgjl_c_sfyx :=get_split_str('1');
v_db_scld_t_scld_rycgjl_c_cjr :=(select C_CJR from db_scld.T_SCLD_CPRSCJL where C_BH = v_db_scld_t_scld_rycgjl_c_cprscjlbh and d_scrq= v_db_scld_t_scld_rycgjl_d_cgrq);
v_db_scld_t_scld_rycgjl_c_gxr :=v_db_scld_t_scld_rycgjl_c_cjr;
v_db_scld_t_scld_rycgjl_dt_crsj :=v_db_scld_t_scld_rycgjl_d_cgrq;
v_db_scld_t_scld_rycgjl_dt_gxsj :=v_db_scld_t_scld_rycgjl_dt_crsj;
v_db_scld_t_scld_rycgjl_c_cjbm :=(select C_CJBM from db_scld.T_SCLD_CPRSCJL where C_BH = v_db_scld_t_scld_rycgjl_c_cprscjlbh);
v_db_scld_t_scld_rycgjl_c_cjdw :=(select left(v_db_scld_t_scld_rycgjl_c_cjbm,6));
/*==人员出工记录 (db_scld.t_scld_rycgjl) ==*/
INSERT INTO db_scld.t_scld_rycgjl(c_bh,c_cprscjlbh,c_jdrybm,c_jdryxm,d_cgrq,c_cgqk,n_gzgs,c_sfyx,c_cjr,c_gxr,dt_crsj,dt_gxsj,c_cjbm,c_cjdw)
VALUES (v_db_scld_t_scld_rycgjl_c_bh,v_db_scld_t_scld_rycgjl_c_cprscjlbh,v_db_scld_t_scld_rycgjl_c_jdrybm,v_db_scld_t_scld_rycgjl_c_jdryxm,v_db_scld_t_scld_rycgjl_d_cgrq,v_db_scld_t_scld_rycgjl_c_cgqk,v_db_scld_t_scld_rycgjl_n_gzgs,v_db_scld_t_scld_rycgjl_c_sfyx,v_db_scld_t_scld_rycgjl_c_cjr,v_db_scld_t_scld_rycgjl_c_gxr,v_db_scld_t_scld_rycgjl_dt_crsj,v_db_scld_t_scld_rycgjl_dt_gxsj,v_db_scld_t_scld_rycgjl_c_cjbm,v_db_scld_t_scld_rycgjl_c_cjdw);
end loop;
end loop;
RETURN 'success';
end;
$BODY$
LANGUAGE 'plpgsql' VOLATILE COST 100
;
ALTER FUNCTION "db_scld"."abase2019rycgjl"(cyclecount int4) OWNER TO "test";
查看表信息
、、、
Indexes:
"pk_scld_rycgjl" UNIQUE, btree (c_bh)
"i_t_scld_rycgjl_c_cprscjlbh" btree (c_cprscjlbh)
"i_t_scld_rycgjl_c_gwbh" btree (c_gwbh)
"i_t_scld_rycgjl_d_cgrq" btree (d_cgrq)
建议在加压过程中去掉索引
加压函数优化
该函数主要是向db_scld.t_scld_rycgjl
表中构造数据,其中包含了一些查询
加压一条数据大约耗时1s
,这个速度已经非常慢了。单条sql执行如果为千分之一秒,那么一秒钟最多也就插入一千条数据
慢的原因主要是使用random()随机获取一条记录时慢
select C_BH from db_scld.t_scld_cprscjl order by `random()` LIMIT 1;
select c_jdrybm from db_scld.t_jdry
where c_bmbm = v_scdd and c_sfyx ='1' and c_ryzszt not in ('05','12','11','07','09','13') order by `random()` limit 1
db_jdsjpt=# explain analyze select C_BH from db_scld.t_scld_cprscjl order by random() LIMIT 1;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Limit (cost=61029.94..61029.94 rows=1 width=41) (actual time=587.193..587.193 rows=1 loops=1)
-> Sort (cost=61029.94..63172.22 rows=856911 width=41) (actual time=587.185..587.185 rows=1 loops=1)
Sort Key: (random())
Sort Method: top-N heapsort Memory: 25kB
-> Seq Scan on t_scld_cprscjl (cost=0.00..56745.39 rows=856911 width=41) (actual time=0.019..380.139 rows=854682 loop
s=1)
Planning time: 1.179 ms
Execution time: 587.242 ms
(7 rows)
--表总数量
db_jdsjpt=# select count(*) from db_scld.t_scld_cprscjl;
count
--------
854682
(1 row)
随机获取一条记录random()
random()
耗时:Time: 389.818 ms
--随机获取一条耗时
db_jdsjpt=# select C_BH from db_scld.t_scld_cprscjl order by random() LIMIT 1;
c_bh
----------------------------------
6d861b011c854040bf5b731f49d40b48
(1 row)
Time: 389.818 ms
改写1
offset
耗时:Time: 60.022 ms
--offset可以走索引,少了排序操作
db_jdsjpt=# select C_BH from db_scld.t_scld_cprscjl offset floor(random()*854682) LIMIT 1;
c_bh
----------------------------------
f90301bd8ac2485196ffae32ee70345c
(1 row)
Time: 60.022 ms
db_jdsjpt=# explain analyze select C_BH from db_scld.t_scld_cprscjl offset floor(random()*854682) LIMIT 1;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Limit (cost=3747.64..3747.68 rows=1 width=33) (actual time=30.758..30.759 rows=1 loops=1)
-> Index Only Scan using i_corscjl_cprscbh_ on t_scld_cprscjl (cost=0.42..37472.65 rows=854682 width=33) (actual time=0.
047..25.808 rows=81993 loops=1)
Heap Fetches: 0
Planning time: 0.228 ms
Execution time: 30.802 ms
(5 rows)
Time: 31.779 ms
改写2
pg从9.5开始提供抽样函数
使用tablesample抽样的过程中比例不能太低,否则可能获取不到结果,且不能带有过滤条件
system
耗时: Time: 0.639 ms
system
:随机性较差,效率高
--改写后耗时
db_jdsjpt=# select c_bh from db_scld.t_scld_cprscjl tablesample system(0.1) limit 1;
c_bh
----------------------------------
e2fce25399db42f0bf49faf8e7214d5f
(1 row)
Time: 0.639 ms
--system随机抽样以块为单位所以更快
db_jdsjpt=# explain analyze select c_bh from db_scld.t_scld_cprscjl tablesample system(0.1) limit 1;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Limit (cost=0.00..0.23 rows=1 width=33) (actual time=0.105..0.105 rows=1 loops=1)
-> Sample Scan on t_scld_cprscjl (cost=0.00..192.55 rows=855 width=33) (actual time=0.102..0.102 rows=1 loops=1)
Sampling: system ('0.1'::real)
Planning time: 0.190 ms
Execution time: 0.134 ms
(5 rows)
Time: 1.182 ms
改写3
bernoulli
:随机性更好,但效率比system要低
bernoullih
耗时:Time: 0.822 ms
db_jdsjpt=# select c_bh from db_scld.t_scld_cprscjl tablesample bernoulli(0.1) limit 1;
c_bh
----------------------------------
7ec30761ffd04bd9ad77797a33645a84
(1 row)
Time: 0.822 ms
--bernoulli以行为单位进行抽样,比system效率低点
db_jdsjpt=# explain analyze select c_bh from db_scld.t_scld_cprscjl tablesample bernoulli(0.1) limit 1;
QUERY PLAN
---------------------------------------------------------------------------------------------------
Limit (cost=0.00..53.85 rows=1 width=33) (actual time=1.410..1.411 rows=1 loops=1)
-> Sample Scan on t_scld_cprscjl (cost=0.00..46042.55 rows=855 width=33) (actual time=1.408..1.408 rows=1 loops=1)
Sampling: bernoulli ('0.1'::real)
Planning time: 0.446 ms
Execution time: 1.436 ms
(5 rows)
Time: 25.770 ms
同理另外一条sql也可用同样的方式,且在c_bmbm
字段上面加上索引
当有条件的时候可以使用offset
获取,offset
的值也可以通过for
循环传入
db_jdsjpt=# select count(*) from db_scld.t_jdry;
count
--------
214819
(1 row)
db_jdsjpt=# select c_jdrybm from db_scld.t_jdry where c_bmbm = '4402222804' and c_sfyx ='1' and c_ryzszt not in ('05','12','11','07','09','13') offset floor(random()*214819) limit 1;
c_jdrybm
----------
(0 rows)
Time: 1.924 ms
对比
方法 | 耗时 |
---|---|
order by random() | 389.818 ms |
offset n | 60.022 ms-240ms |
system() | 0.639 ms |
bernoulli() | 0.822 ms |
使用offset
的时候和n的大小有关系,当n越大,扫描的索引块越多,就越大,但是相对于order by random()
的方式仍然要快。
注意
system(0.1)
等于百分之零点一,也就是抽样千分之一 854682*0.001=854
,大概每次抽取854
条记录
--system
db_jdsjpt=# select count(*) from db_scld.t_scld_cprscjl tablesample system(0.1) ;
count
-------
592
(1 row)
Time: 1.499 ms
--bernoulli
db_jdsjpt=# select count(*) from db_scld.t_scld_cprscjl tablesample bernoulli(0.1) ;
count
-------
840
(1 row)
Time: 86.037 ms
这里可以看出bernoulli效率比system要低
优化后
CREATE OR REPLACE FUNCTION "db_scld"."abase2019rycgjl_zfx"(cyclecount int4)
RETURNS "pg_catalog"."varchar" AS $BODY$
declare
v_db_scld_t_scld_rycgjl_c_bh bpchar;
v_db_scld_t_scld_rycgjl_c_cprscjlbh bpchar;
v_db_scld_t_scld_rycgjl_c_jdrybm varchar(300);
v_db_scld_t_scld_rycgjl_c_jdryxm varchar(300);
v_db_scld_t_scld_rycgjl_d_cgrq date;
v_db_scld_t_scld_rycgjl_c_cgqk varchar(300);
v_db_scld_t_scld_rycgjl_n_gzgs numeric;
v_db_scld_t_scld_rycgjl_c_sfyx varchar(300);
v_db_scld_t_scld_rycgjl_c_cjr varchar(300);
v_db_scld_t_scld_rycgjl_c_gxr varchar(300);
v_db_scld_t_scld_rycgjl_dt_crsj timestamp;
v_db_scld_t_scld_rycgjl_dt_gxsj timestamp;
v_db_scld_t_scld_rycgjl_c_cjbm varchar(300);
v_db_scld_t_scld_rycgjl_c_cjdw varchar(300);
v_s varchar(300);
v_scdd varchar(300);
BEGIN
for i in 1.. cycleCount loop
v_db_scld_t_scld_rycgjl_c_cprscjlbh :=(select C_BH from db_scld.t_scld_cprscjl tablesample system(0.1) LIMIT 1);
v_scdd := (select c_cjbm from db_scld.t_scld_cprscjl where c_bh= v_db_scld_t_scld_rycgjl_c_cprscjlbh );
for j in 1 .. 144 loop
v_db_scld_t_scld_rycgjl_c_bh := "public".get_UUID(32,32);
v_db_scld_t_scld_rycgjl_c_jdrybm :=(select c_jdrybm from db_scld.t_jdry
where c_bmbm = v_scdd and c_sfyx ='1' and c_ryzszt not in ('05','12','11','07','09','13') offset j-1 limit 1 );
v_db_scld_t_scld_rycgjl_c_jdryxm :=(select c_xm from db_scld.t_jdry where c_jdrybm = v_db_scld_t_scld_rycgjl_c_jdrybm);
v_db_scld_t_scld_rycgjl_d_cgrq :=(select d_scrq from db_scld.T_SCLD_CPRSCJL where C_BH = v_db_scld_t_scld_rycgjl_c_cprscjlbh);
v_db_scld_t_scld_rycgjl_c_cgqk :=get_split_str('01');
v_db_scld_t_scld_rycgjl_n_gzgs :=get_rand_int(8,8);
v_db_scld_t_scld_rycgjl_c_sfyx :=get_split_str('1');
v_db_scld_t_scld_rycgjl_c_cjr :=(select C_CJR from db_scld.T_SCLD_CPRSCJL where C_BH = v_db_scld_t_scld_rycgjl_c_cprscjlbh and d_scrq= v_db_scld_t_scld_rycgjl_d_cgrq);
v_db_scld_t_scld_rycgjl_c_gxr :=v_db_scld_t_scld_rycgjl_c_cjr;
v_db_scld_t_scld_rycgjl_dt_crsj :=v_db_scld_t_scld_rycgjl_d_cgrq;
v_db_scld_t_scld_rycgjl_dt_gxsj :=v_db_scld_t_scld_rycgjl_dt_crsj;
v_db_scld_t_scld_rycgjl_c_cjbm :=(select C_CJBM from db_scld.T_SCLD_CPRSCJL where C_BH = v_db_scld_t_scld_rycgjl_c_cprscjlbh);
v_db_scld_t_scld_rycgjl_c_cjdw :=(select left(v_db_scld_t_scld_rycgjl_c_cjbm,6));
/*==人员出工记录 (db_scld.t_scld_rycgjl) ==*/
INSERT INTO db_scld.t_scld_rycgjl_zfx(c_bh,c_cprscjlbh,c_jdrybm,c_jdryxm,d_cgrq,c_cgqk,n_gzgs,c_sfyx,c_cjr,c_gxr,dt_crsj,dt_gxsj,c_cjbm,c_cjdw)
VALUES (v_db_scld_t_scld_rycgjl_c_bh,v_db_scld_t_scld_rycgjl_c_cprscjlbh,v_db_scld_t_scld_rycgjl_c_jdrybm,v_db_scld_t_scld_rycgjl_c_jdryxm,v_db_scld_t_scld_rycgjl_d_cgrq,v_db_scld_t_scld_rycgjl_c_cgqk,v_db_scld_t_scld_rycgjl_n_gzgs,v_db_scld_t_scld_rycgjl_c_sfyx,v_db_scld_t_scld_rycgjl_c_cjr,v_db_scld_t_scld_rycgjl_c_gxr,v_db_scld_t_scld_rycgjl_dt_crsj,v_db_scld_t_scld_rycgjl_dt_gxsj,v_db_scld_t_scld_rycgjl_c_cjbm,v_db_scld_t_scld_rycgjl_c_cjdw);
end loop;
end loop;
RETURN 'success';
end;
$BODY$
LANGUAGE 'plpgsql' VOLATILE COST 100
;
ALTER FUNCTION "db_scld"."abase2019rycgjl_zfx"(cyclecount int4) OWNER TO "test";
--执行1000次
db_jdsjpt=# select "db_scld"."abase2019rycgjl_zfx"(1000);
abase2019rycgjl_zfx
---------------------
success
(1 row)
Time: 88489.060 ms
--88s一共插入14w条数据
db_jdsjpt=# select count(*) from db_scld.t_scld_rycgjl_zfx;
count
--------
144000
(1 row)
Time: 29.704 ms
结语
1.公司许多服务器存储划分的是网络存储
,数据库禁止使用网络存储
2.数据加压过程中,应删除索引,待数据构造完毕在建索引
3.随机获取表中的一条数据,当表中数据较小时使用random感觉不明显,当数据量大时random由于每次都需要排序操作,导致随机获取一条的成本较高
4.随机获取一条记录可以使用limit 1 offset n-1的方式,或者使用随机抽样的方式
5.无论是使用limit 1 offset n还是使用tablesample随机抽样方式都需要知道表中的数据量,不能超过表数据量