记一次SQLServer复制监控器(replication monitor)复制延迟数值为NULL的异常处理
现象
在SQLServer复制(订阅发布),在正常运行的情况下,发布节点一直有写入,订阅节点也正常复制到了这些数据,但分发节点的复制监控器面板(replication monitor)无法看到部分发布对象的延迟信息。如下,经过重启SQLServer服务,重启SQLServer Agent服务,重启操作系统等尝试后,均无效,依旧显示不出来复制的latency信息。
查询复制监控器面板后台的存储过程sp_replmonitorhelpsubscription(本质上上述图形界面显示的就是这个系统存储过程的结果),同样地,latency字段为NULL。
这就有点讲不通了,因为复制是正常的,发布节点一直有写入,订阅节点也正常复制到了这些数据,不管延迟或大或小,正常情况下延迟总是一个大于等于0的数字,这里为什么会是NULL?
分析
查询一下相关的分发历史记录:问题开始出现端倪,发布历史日志显示的时间是未来一个很远的时间(当前时间是20231115),这明显不符合逻辑,问题很可能就在这里。
经询问是某个人因测试修改了服务器的系统时间,测试完成后又修改了回来。
大胆地猜测一下:
延迟信息是根据发布的历史执行时间跟当前时间对比,然后经过某种计算而得到的,由于修改了服务器时间,在生成的分发日志表的时间是“修改服务器时间之后的当前时间”,服务器时间修改过来之后,发布的历史记录的执行时间是一个将来的时间,那么计算出来的延迟就是一个超前的时间了,所以也就无法显示了
那么来看一下sp_replmonitorhelpsubscription是怎么计算的延迟时间的,从distrubution库扒出这个存储过程的代码,诡异的是,这个存储过程定义了临时表,#tmp_replication_monitordata,却没有赋值???????有兴趣的小伙伴可以看看这个临时表是怎么初始化数据进去的?
USE [distribution] GO SET ANSI_NULLS OFF GO SET QUOTED_IDENTIFIER OFF GO ALTER procedure [sys].[sp_replmonitorhelpsubscription] ( @publisher sysname = NULL -- pubisher - cannot be null for sql server 2008 or before. for SQL 11 or later, means all publishers ,@publisher_db sysname = NULL -- NULL for wildcard entry ,@publication sysname = NULL -- NULL for wildcard entry ,@publication_type int = NULL -- cannot be null in wildcard entry ,@mode int = 0 -- various filter modes ,@topnum int = NULL -- select only top topnum rows ,@exclude_anonymous bit = 0 -- exclude anonymous subscribers ,@refreshpolicy tinyint = 0 -- 0 = default cache refresh, 1 = optimistic force refresh, 2 = non-optimistic force refresh ) as begin set nocount on declare @retcode int ,@curdate datetime -- -- constants -- declare @expiration int ,@latency int ,@distribution_agentname sysname ,@mergeexpirationChosen int ,@mergerundurationChosen int ,@mergestoppedcontinuousagentChosen int ,@mergerunspeedChosen int ,@modeallsubscription tinyint ,@modeinerroronly tinyint ,@modeinwarningonly tinyint ,@modeinerrorandwarningonly tinyint ,@modetop25worstperforming tinyint ,@modetop50worstperforming tinyint ,@modesynchronizing tinyint ,@modenotsynchronizing tinyint -- -- initialize constants -- select @expiration = 1 ,@latency = 2 ,@mergeexpirationChosen=4 ,@mergerundurationChosen=5 ,@mergestoppedcontinuousagentChosen=6 ,@mergerunspeedChosen=7 ,@modeallsubscription = 0 ,@modeinerroronly = 1 ,@modeinwarningonly = 2 ,@modeinerrorandwarningonly = 3 ,@modetop25worstperforming = 4 ,@modetop50worstperforming = 5 ,@modesynchronizing = 6 ,@modenotsynchronizing = 7 -- -- security check : replmonitor -- if not (is_member(N'db_owner') = 1 or isnull(is_member(N'replmonitor'),0) = 1) begin raiserror(14260, 16, -1) return (1) end -- -- security: Has to be executed from distribution database -- if sys.fn_MSrepl_isdistdb (db_name()) != 1 begin raiserror (21482, 16, -1, 'sp_replmonitorhelpsubscription', 'distribution') return 1 end -- -- validate @publication_type -- if (@publisher_db is NULL or @publication is NULL) and @publication_type is NULL begin raiserror(20587, 16, -1, '@publication_type', 'sp_replmonitorhelpsubscription') return (1) end -- -- validate @mode -- if (@mode not in (@modeallsubscription,@modeinerroronly,@modeinwarningonly,@modeinerrorandwarningonly ,@modetop25worstperforming,@modetop50worstperforming,@modesynchronizing,@modenotsynchronizing)) begin raiserror(20587, 16, -1, '@mode', 'sp_replmonitorhelpsubscription') return (1) end -- -- initialize constants -- select @expiration = 1 ,@latency = 2 ,@curdate = getdate() -- -- create temp table to get the monitoring data -- create table #tmp_replication_monitordata ( publication_id int ,publisher sysname ,publisher_srvid int ,publisher_db sysname ,publication sysname ,publication_type int ,agent_type int ,agent_name sysname ,job_id uniqueidentifier ,status int ,isagentrunningnow bit ,warning int -- latest session's warning ,last_distsync datetime -- last sync time ,agentstoptime datetime -- agent stop time ,distdb sysname null ,retention int ,time_stamp datetime null ,worst_latency int ,best_latency int ,avg_latency int ,cur_latency int ,mergePerformance int ,mergelatestsessionrunduration int ,mergelatestsessionrunspeed float ,mergelatestsessionconnectiontype int ,retention_period_unit tinyint ) if @@error != 0 begin raiserror(20507, 16, 3, '#tmp_replication_monitordata', 'tempdb') return 1 end -- -- build indices -- create nonclustered index nc1tmp_replication_monitordata on #tmp_replication_monitordata(publisher_srvid) create nonclustered index nc2tmp_replication_monitordata on #tmp_replication_monitordata(agent_type) create nonclustered index nc4tmp_replication_monitordata on #tmp_replication_monitordata(publisher) create nonclustered index nc5tmp_replication_monitordata on #tmp_replication_monitordata(publication, publisher_db) create nonclustered index nc6tmp_replication_monitordata on #tmp_replication_monitordata(agent_name) if (@@error != 0) return 1 -- Get publisher without port number exec sys.sp_MSget_server_portinfo @name = @publisher, @srvname = @publisher OUTPUT -- -- Get refresh data -- exec @retcode = sys.sp_replmonitorrefreshdata @publisher=@publisher ,@publisher_db=@publisher_db ,@publication=@publication ,@publication_type=@publication_type ,@refreshpolicy = @refreshpolicy if @@error != 0 or @retcode != 0 return 1 -- -- -- set @publication_type for specific publication query -- and validate -- if (@publication_type is null) begin select @publication_type = publication_type from #tmp_replication_monitordata where upper(publisher) = upper(@publisher) and publisher_db = @publisher_db and publication = @publication end else if (@publication_type not in (0,1,2)) begin raiserror(20587, 16, -1, '@publication_type', 'sp_replmonitorhelpsubscription') return (1) end -- -- Adjust the rows to return -- if (@mode in (@modetop25worstperforming, @modetop50worstperforming)) begin if (@mode = @modetop25worstperforming) select @topnum = case when (isnull(@topnum, 30) > 25) then 25 else @topnum end else select @topnum = case when (isnull(@topnum, 60) > 50) then 50 else @topnum end end -- -- process based on publication_type -- if (@publication_type in (0,1)) begin create table #tmp_subscriptiondata ( -- static publisher_srvid int ,publisher_db sysname ,publication sysname ,publication_id int ,subscriber_id int ,subscriber sysname null ,subscriber_db sysname ,publication_type int -- Type of publication: 0 = Transactional, 1 = Snapshot, 2 = Merge ,subtype int -- Type of subscription: dbo.MSmerge_subscriptions.subscription_type -- 0 = Push, 1 = Pull, 2 = Anonymous ,subscription_time datetime -- dynamic ,status int ,warning int ,monitorranking int ,last_distsync datetime -- tran ,distribution_agentname sysname null ,logreaderagent_status int null ,latency int ,latencythreshold int ,agentnotrunning int ,agentnotrunningthreshold int ,timetoexpiration int ,expirationthreshold int -- merge -- fill mergesubscriptionfriendlyname by dbo.MSmerge_subscriptions.description ,mergesubscriptionfriendlyname sysname null ,mergeconnectiontype int null ,mergeagentname sysname null ,mergeagentlocation sysname null -- among all session of this subscription, depending on latest connection type -- calc average of run_speed among the subset, -- current_run_speed/avg_run_speed ,mergePerformance int null ,mergerunspeed float null --latest running session ,mergerunduration int null --latest running session -- Added later - see if we can reorder ,distributionagentjobid binary(16) null ,mergeagentjobid binary(16) null ,distributionagentid int null ,distributionagentprofileid int null ,mergeagentid int null ,mergeagentprofileid int null ,logreaderagentname sysname null ---only for SQL11 or later ,publisher sysname null ) create clustered index csubscriptiondata on #tmp_subscriptiondata (monitorranking desc) create index nc1subscriptiondata on #tmp_subscriptiondata (distribution_agentname) create index nc2subscriptiondata on #tmp_subscriptiondata (latency desc) create index nc3subscriptiondata on #tmp_subscriptiondata (mergePerformance) create index nc4subscriptiondata on #tmp_subscriptiondata (publisher_srvid, publisher_db, publication_id, subscriber_db, subscriber_id) create index nc5subscriptiondata on #tmp_subscriptiondata (mergeagentname) -- ************************************** -- build #tmp_subscriptiondata for tran -- both snaphot and transactional subscriptions will have a distribution agent -- Get the subscription details based on the distribution agent -- insert into #tmp_subscriptiondata ( subscriber_id ,subscriber ,subscriber_db ,publisher_srvid ,publisher_db ,publication ,publication_id ,publication_type ,subtype ,distribution_agentname ,distributionagentjobid ,distributionagentid ,distributionagentprofileid ,publisher ) select distinct s.subscriber_id, case when s.anonymous_subid is not null then -- anonymous subscription upper(s.subscriber_name) -- name is stored in subscriber_name instead of sys.servers else upper(srv.srvname) end ,s.subscriber_db ,r.publisher_srvid ,r.publisher_db ,r.publication ,r.publication_id ,r.publication_type ,s.subscription_type ,r.agent_name ,s.job_id ,s.id ,s.profile_id ,r.publisher from #tmp_replication_monitordata as r join dbo.MSdistribution_agents as s with (nolock) on r.publisher_srvid = s.publisher_id and r.publisher_db = s.publisher_db collate database_default and (r.publication = s.publication collate database_default or s.publication = 'ALL' collate database_default) and s.subscriber_id >= 0 -- skip the virtual subscriptions and (r.agent_type & 3) = 3 -- we select the distribution agent entries to optimize and (@exclude_anonymous = 0 or s.anonymous_subid is null) -- anonymous and r.job_id = cast(s.job_id as uniqueidentifier) join MSreplservers as srv on srv.srvid = s.subscriber_id -- -- get the latest subscription_time for the subscription to each publication -- update #tmp_subscriptiondata set subscription_time = (select max(s.subscription_time) from dbo.MSsubscriptions as s with (nolock) where subscriber_id = s.subscriber_id and subscriber_db = s.subscriber_db and publisher_srvid = s.publisher_id and publisher_db = s.publisher_db and publication_id = s.publication_id and subtype = s.subscription_type) -- -- The logreader agent status needs to incorporated in the subscription status -- update #tmp_subscriptiondata set logreaderagent_status = isnull(r.status,0) ,logreaderagentname = r.agent_name from #tmp_subscriptiondata as s join #tmp_replication_monitordata as r on s.publisher_srvid = r.publisher_srvid and s.publisher_db = r.publisher_db and r.agent_type=2 and s.publication_type = 0 -- -- set the status of the subscription based on the type of -- publication and agents status that are involved -- populate the threshold values -- populate other fields from #tmp_replication_monitordata -- update #tmp_subscriptiondata set expirationthreshold = cast(sys.fn_replgetpublicationthreshold(r.publication_id, @expiration) as int) ,latencythreshold = cast(sys.fn_replgetpublicationthreshold(r.publication_id, @latency) as int) ,warning = r.warning ,status = case -- when snapshot publication - use distribution agent status when (r.publication_type = 1) then r.status -- else we have transactional publication else case -- Error = when any one agent has error when (r.status = 6 or logreaderagent_status = 6) then 6 -- Retry = when any one agent has retry when (r.status = 5 or logreaderagent_status = 5) then 5 -- Stopped = when any one agent has stopped when (r.status = 2 or logreaderagent_status = 2) then 2 -- Idle (Running) = when both agents are idle when (r.status = 4 and logreaderagent_status = 4) then 4 -- Inprogress (Running) = when any one agent is in progress and the other is still running when (r.status = 3 and logreaderagent_status in (3, 4) or logreaderagent_status = 3 and r.status in (3, 4)) then 3 -- Startup (Running) = when any one agent is in startup and the other is still running when (r.status = 1 and logreaderagent_status in (1, 3, 4) or logreaderagent_status = 1 and r.status in (1, 3, 4)) then 1 -- we should not come here else 0 end end ,latency = r.cur_latency ,agentnotrunning = case when (r.agentstoptime is null) then null else datediff(hour, r.agentstoptime, @curdate) end ,last_distsync = r.last_distsync ,timetoexpiration = case when (r.retention is null) then null else datediff(hour, @curdate, dateadd(hour, r.retention, subscription_time)) end from #tmp_replication_monitordata as r where r.agent_name = distribution_agentname -- -- Set the monitor ranking based on status, warning, publication_type -- update #tmp_subscriptiondata set monitorranking = sys.fn_replmonitorsubscriptionranking(status, warning, publication_type) -- -- set the rowcount if necessary -- if @topnum is not null set rowcount @topnum -- -- return the rowset based on mode -- select status ,warning ,subscriber ,subscriber_db ,publisher_db ,publication ,publication_type ,subtype ,latency ,latencythreshold ,agentnotrunning ,agentnotrunningthreshold ,timetoexpiration ,expirationthreshold ,last_distsync ,distribution_agentname ,mergeagentname ,mergesubscriptionfriendlyname ,mergeagentlocation ,mergeconnectiontype ,mergePerformance ,mergerunspeed ,mergerunduration ,monitorranking ,distributionagentjobid ,mergeagentjobid ,distributionagentid ,distributionagentprofileid ,mergeagentid ,mergeagentprofileid ,logreaderagentname ,publisher from #tmp_subscriptiondata where @mode in (@modeallsubscription, @modetop25worstperforming, @modetop50worstperforming) -- return all subscriptions or (@mode = @modeinerroronly and monitorranking = 60) -- return only ones in error or (@mode = @modeinwarningonly and monitorranking between 50 and 59 ) -- return only ones with warning or (@mode = @modeinerrorandwarningonly and monitorranking between 50 and 60 ) -- return only ones with warning or error or (@mode = @modesynchronizing and status in (1,3,4)) -- return only ones with running agents or (@mode = @modenotsynchronizing and status not in (1,3,4)) -- return only ones with stopped agents order by monitorranking desc -- highest rank will be shown first ,latency desc -- highest latency first end else begin -- ************************************ -- handle merge subscriptions declare @publisher_id smallint select @publisher_id = srvid from MSreplservers where UPPER(srvname collate database_default) = UPPER(@publisher) collate database_default -- -- set the rowcount if necessary -- if @topnum is not null set rowcount @topnum -- -- return the rowset based on mode -- select sub.status ,sub.warning ,sub.subscriber ,sub.subscriber_db ,sub.publisher_db ,sub.publication ,sub.publication_type ,sub.subtype ,sub.latency ,sub.latencythreshold ,sub.agentnotrunning ,sub.agentnotrunningthreshold ,timetoexpiration = case when (sub.retention is null) then null else (case when sub.retention_period_unit = 0 then datediff(hour, @curdate, dateadd(hour, sub.retention, sub.last_distsync)) else (datediff(hour, @curdate, sys.fn_add_units_to_date(sub.retention, sub.retention_period_unit, sub.last_distsync))) end) end ,sub.expirationthreshold ,sub.last_distsync ,sub.distribution_agentname ,sub.mergeagentname ,sub.mergesubscriptionfriendlyname ,sub.mergeagentlocation ,sub.mergeconnectiontype ,sub.mergePerformance ,sub.mergerunspeed ,sub.mergerunduration ,sub.monitorranking ,sub.distributionagentjobid ,sub.mergeagentjobid ,sub.distributionagentid ,sub.distributionagentprofileid ,sub.mergeagentid ,sub.mergeagentprofileid ,sub.logreaderagentname ,sub.publisher from ( select -- static subscriber = upper(agents.subscriber_name) ,subscriber_id = agents.subscriber_id ,subscriber_db = agents.subscriber_db ,publisher_srvid = r.publisher_srvid ,publisher_db = r.publisher_db ,publication = r.publication ,publication_id = r.publication_id ,publication_type = r.publication_type ,subtype = case when agents.anonymous_subid is not null then 2 else s.subscription_type end -- dynamic ,status = r.status ,warning = r.warning ,agentnotrunning = case when (r.agentstoptime is null) then null else datediff(hour, @curdate, r.agentstoptime) end ,last_distsync = r.last_distsync ,subscription_time = s.subscription_time -- merge perf ,mergeagentname = agents.name --,'agentname' ,mergeagentlocation = case when s.subscription_type=0 then @publisher else UPPER(agents.subscriber_name) end ,mergesubscriptionfriendlyname = s.description ,mergeconnectiontype = mergelatestsessionconnectiontype ,mergerunduration = r.mergelatestsessionrunduration --2,NULL ,mergerunspeed=r.mergelatestsessionrunspeed ,mergePerformance = r.mergePerformance ,mergeagentjobid = agents.job_id ,mergeagentid = agents.id ,mergeagentprofileid = agents.profile_id ,retention = r.retention ,retention_period_unit = r.retention_period_unit ,monitorranking = case when (r.status > 5 or r.warning > 0) then sys.fn_replmonitorsubscriptionranking(r.status, r.warning, r.publication_type) else (case when r.status = 0 then 50 else (case when r.status = 5 then 40 else (case when r.status = 2 then 20 else 30 end) end) end) end ,latency = NULL ,latencythreshold = NULL ,agentnotrunningthreshold = NULL ,distribution_agentname = NULL ,distributionagentjobid = NULL ,distributionagentid = NULL ,distributionagentprofileid = NULL ,logreaderagentname = NULL ,expirationthreshold = NULL ,r.publisher from #tmp_replication_monitordata as r join dbo.MSmerge_agents agents with (nolock) on agents.name = r.agent_name collate database_default and (@exclude_anonymous = 0 or agents.anonymous_subid is null) and ((@publisher is null and agents.publisher_id is not null) or (agents.publisher_id=@publisher_id)) and agents.publisher_db=r.publisher_db collate database_default and agents.publication=r.publication collate database_default and (r.agent_type & 4) = 4 -- for merge only left outer join dbo.MSmerge_subscriptions as s with (nolock) on r.publisher_srvid = s.publisher_id and r.publisher_db = s.publisher_db collate database_default and r.publication_id = s.publication_id and s.subscriber is not NULL -- skip the virtual entries in dbo.MSsubscriptions and UPPER(agents.subscriber_name)=UPPER(s.subscriber) and agents.subscriber_db=s.subscriber_db ) as sub where @mode = @modeallsubscription -- return all subscriptions or (@mode in (@modetop25worstperforming, @modetop50worstperforming) and sub.mergePerformance is not null) -- return worst 25/50 subscriptions based on perf. or (@mode = @modeinerroronly and sub.monitorranking = 60) -- return only ones in error or (@mode = @modeinwarningonly and sub.monitorranking between 50 and 59 ) -- return only ones with warning or (@mode = @modeinerrorandwarningonly and sub.monitorranking between 50 and 60 ) -- return only ones with warning or error or (@mode = @modesynchronizing and sub.status in (1,3,4)) -- return only ones with running agents or (@mode = @modenotsynchronizing and sub.status not in (1,3,4)) -- return only ones with stopped agents order by sub.monitorranking desc -- highest rank will be shown first ,sub.mergePerformance asc -- lowest mergePerformance first end -- -- all done -- return 0 end
以上是并没有找打计算latency的相关逻辑,之前已经经过各种重启,disable/enable 复制相关的job,均不生效,所以准备采取非常规手段
解决
考虑到订阅发布的复制历史日志,仅供参考使用,并不是必须的,且存在自动过期清理,于是,手起刀落,直接删掉超出当前时间的日志
delete from msdistribution_history where time>'2024-01-15 22:57:44.220'
sp_replmonitorhelpsubscription系统存储过程的结果页恢复正常
总结
个人并非熟悉测试的套路,但是一直摒弃通过修改服务器时间来达到某种测试的目的,这种方式引起的问题是多方面和未知的。
同时话题展开一点,不止一次遇到过类似场景,由于表中存储一个从客户端获取的时间,而客户端的时间设置错乱,并非是一个准确的当前时间,导致记录到数据库里的数据,从当前看,逻辑上各种诡异(比如从自增IId大的数据的生成时间比Id小的要晚这种逻辑错误)
因此考虑在设计表的时候,增加一个跟业务逻辑无关的时间戳字段,并且这个字段只能服务器端默认生成,不能由客户端写入,方面出现类似问题时做诊断线索。