监控-作业执行

原始脚本来自TG,自己对部分脚本做了调整,分享出来仅供参考,请勿整篇Copy!


使用以下语句获取【作业异常信息】

复制代码
USE [DBA_Monitor]
GO
/****** 对象:  StoredProcedure [dbo].[DBA_Pro_Get_JobInfo]    脚本日期: 09/08/2017 10:39:47 ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
/******************************************************************
【概要说明】
    <作    者>    DBA
    <创建时间>    2015/01/19 14:27
    <中文名称>    获取作业异常信息
    <功能说明>    采集当前SQL实例下面的作业异常信息
    <调用方式>    作业方式定时调用为主;错误处理后,维护人员手工调用为辅
    <执行说明>    先决条件:正确的传递实例编号参数@InstanceID
                 注意事项:暂无
    <调用示例>  exec DBA_Pro_Get_JobInfo 100233,'UestMail@163.com'
【修订记录】
-------------------------------------------------------------------
    <2015/01/19 14:27>  <DBA>: 创建
    <2015/02/04 14:27>  <DBA>: 修改过程,在处理作业执行时间过长时,临时表的messageinfo长度不够用
******************************************************************/
ALTER proc [dbo].[DBA_Pro_Get_JobInfo]
    @InstanceID    int,
    @Recipients    VARCHAR(200)=null
AS
BEGIN
    SET NOCOUNT ON

    IF (@InstanceID<100000 or @InstanceID>999999)
    begin
        print '数据库实例编号参数错误!'
        return
    end

    BEGIN TRY

    Declare @now datetime,                --当前时间
            @starttime datetime,        --一个小时前
            @html_tb VARCHAR(MAX),        --邮件内容
            @sub varchar(100),            --邮件标题
            @ip INT,                    --IP地址最后的段
            @Today    int,
            @Tomorrow    int,
            @EndTime    int,
            @BeginTime    int,
            @html_tb1 VARCHAR(MAX),        --邮件内容
            @html_tb2 VARCHAR(MAX)        --邮件内容

    select @html_tb='',@html_tb1='',@html_tb2=''
    select @now=GETDATE()
    select @starttime=DATEADD(HH,-1,@now)

    --作业执行错误
    if OBJECT_ID('tempdb..#JobError')is not null
        drop table #JobError
    create table #JobError
    (
        jobname varchar(128) not null,
        ownname varchar(64) not null,
        step_id int not null,
        step_name varchar(128) not null,
        messageinfo varchar(max) null,
        run_status int not null,
        run_datetime datetime not null,
        run_duration varchar(8) not null
    )
    /*
    INSERT INTO #JobError
    SELECT S.name,suser_sname(S.owner_sid) ownname,H.step_id,H.step_name,H.message,H.run_status
    ,msdb.dbo.agent_datetime(H.run_date,H.run_time) run_datetime
    ,convert(varchar,msdb.dbo.agent_datetime(H.run_date,H.run_duration),108) run_duration
    FROM 
        msdb.dbo.sysjobs S WITH(NOLOCK)
    INNER JOIN
        msdb.dbo.sysjobhistory H WITH(NOLOCK) ON S.job_id=H.job_id
    --LEFT JOIN
        --sys.syslogins L WITH(NOLOCK) ON S.owner_sid=L.sid
    WHERE H.run_status IN (0,3) AND step_id<>0 
        AND msdb.dbo.agent_datetime(H.run_date,H.run_time)>@starttime
    */
    --alter 2016-11-17 部分作业失败,比如作业所有者没有访问服务器的权限,在历史记录只有step_id=0的记录
    INSERT INTO #JobError
    select isnull(b.name,a.name) jobname
    ,isnull(b.ownname,a.ownname) ownname
    ,isnull(b.step_id,a.step_id) step_id
    ,isnull(b.step_name,a.step_name) step_name
    ,isnull(b.message,a.message) message
    ,isnull(b.run_status,a.run_status) run_status
    ,isnull(b.starttime,a.starttime) run_datetime
    ,isnull(b.run_duration,a.run_duration) run_duration
    from (
    --step_id=0
    select s.name,suser_sname(s.owner_sid) ownname,h.step_id,h.step_name,h.message,h.run_status 
    ,msdb.dbo.agent_datetime(h.run_date,h.run_time) starttime
    ,convert(varchar,msdb.dbo.agent_datetime(h.run_date,h.run_duration),108) run_duration
    ,dateadd(s
    ,datediff(s,msdb.dbo.agent_datetime(20160101,0),msdb.dbo.agent_datetime(20160101,h.run_duration))
    ,msdb.dbo.agent_datetime(h.run_date,h.run_time)) endtime
    FROM
        msdb.dbo.sysjobs s WITH(NOLOCK)
    INNER JOIN
        msdb.dbo.sysjobhistory h WITH(NOLOCK) ON s.job_id=h.job_id
    where h.run_status in(0,3) and h.step_id=0
        AND msdb.dbo.agent_datetime(H.run_date,H.run_time)>@starttime
    ) a
    --alter 2017-09-08 left join->full join 
    --多步骤作业,设置步骤成功/失败时转到下一步,其中第一步失败,后面步骤成功,整体作业报告成功(step_id=0 and run_status=1)
    full join(
    --step_id>0
    select s.name,suser_sname(s.owner_sid) ownname,h.step_id,h.step_name,h.message,h.run_status 
    ,msdb.dbo.agent_datetime(h.run_date,h.run_time) starttime
    ,convert(varchar,msdb.dbo.agent_datetime(h.run_date,h.run_duration),108) run_duration
    ,dateadd(s
    ,datediff(s,msdb.dbo.agent_datetime(20160101,0),msdb.dbo.agent_datetime(20160101,h.run_duration))
    ,msdb.dbo.agent_datetime(h.run_date,h.run_time)) endtime
    FROM
        msdb.dbo.sysjobs s WITH(NOLOCK)
    INNER JOIN
        msdb.dbo.sysjobhistory h WITH(NOLOCK) ON s.job_id=h.job_id
    where h.run_status in(0,3) and h.step_id>0
        AND msdb.dbo.agent_datetime(H.run_date,H.run_time)>@starttime
    ) b
    on a.name=b.name
    and a.starttime<=b.starttime
    and a.endtime>=b.endtime

    INSERT INTO Info_JobHistory(instanceid,jobname,ownname,step_id,step_name,messageinfo,run_status,run_datetime,run_duration,flag)
    SELECT @InstanceID,jobname,ownname,step_id,step_name,messageinfo,run_status,run_datetime,run_duration,1 as flag
    FROM #JobError
    ORDER BY run_datetime,jobname,step_id

    --作业执行时间过长
    if OBJECT_ID('tempdb..#JobRunLongtime')is not null
        drop table #JobRunLongtime
    create table #JobRunLongtime
    (
        jobname varchar(128) not null,
        ownname varchar(64) not null,
        step_id int not null,
        step_name varchar(128) not null,
        messageinfo varchar(max) null,
        run_status int not null,
        run_datetime datetime not null,
        run_duration varchar(8) not null,
        prev_run_duration varchar(8) not null,
        [growth] varchar(8) default('0')
    )

    ;with leno as
    (
        SELECT S.name,suser_sname(S.owner_sid) ownname,H.step_id,H.step_name,H.message,H.run_status
        ,msdb.dbo.agent_datetime(H.run_date,H.run_time) run_datetime,H.run_duration
        ,ROW_NUMBER()over(partition by s.name,h.step_id order by h.run_date desc,h.run_time desc)cwt
        FROM 
            msdb.dbo.sysjobs S WITH(NOLOCK)
        INNER JOIN
            msdb.dbo.sysjobhistory H WITH(NOLOCK) ON S.job_id=H.job_id
        --LEFT JOIN
            --sys.syslogins L WITH(NOLOCK) ON S.owner_sid=L.sid
        WHERE step_id<>0
            AND H.run_status=1
            AND msdb.dbo.agent_datetime(H.run_date,H.run_time)>@starttime
    )
    ,leno1 as
    (
        SELECT * FROM leno s
        WHERE cwt=1 AND run_duration>30
    )
    ,leno2 as
    (
        SELECT * FROM leno s
        WHERE cwt=2 AND run_duration>30
    )
    INSERT INTO #JobRunLongtime    
    SELECT a.name,a.ownname,a.step_id,a.step_name,a.message,a.run_status,a.run_datetime
    ,CONVERT(VARCHAR,msdb.dbo.agent_datetime(20160101,a.run_duration),108) run_duration
    ,CONVERT(VARCHAR,msdb.dbo.agent_datetime(20160101,b.run_duration),108) prev_run_duration
    ,CONVERT(DECIMAL(9,2),100.0*DATEDIFF(s,msdb.dbo.agent_datetime(20160101,b.run_duration),msdb.dbo.agent_datetime(20160101,a.run_duration))
    /DATEDIFF(s,msdb.dbo.agent_datetime(20160101,0),msdb.dbo.agent_datetime(20160101,b.run_duration))) growth
    FROM leno1 a
    INNER JOIN leno2 b
    ON a.name=b.name and a.step_name=b.step_name
    WHERE a.run_duration>b.run_duration
    AND DATEDIFF(s,msdb.dbo.agent_datetime(20160101,b.run_duration),msdb.dbo.agent_datetime(20160101,a.run_duration))>10

    INSERT INTO Info_JobHistory(instanceid,jobname,ownname,step_id,step_name,messageinfo,run_status,run_datetime,run_duration,prev_run_duration,[growth(%)],flag)
    SELECT @InstanceID,jobname,ownname,step_id,step_name,messageinfo,run_status,run_datetime,run_duration,prev_run_duration,growth,2 as flag
    FROM #JobRunLongtime
    ORDER BY run_datetime,jobname,step_id

    /*--发送邮件
    select @IP=Reverse(left(Reverse(IntranetIP),charindex('.',Reverse(IntranetIP))-1)) FROM Info_Hardware where ServerID=left(@InstanceID,4)
    declare @HostName varchar(64)
    select @HostName=HostName From dbo.Info_Instance where InstanceID=@InstanceID

    if @ip is not null
        set @sub='['+cast(@ip as varchar)+'][重要] 数据库作业执行预警'
    else
        set @sub='[重要] 数据库作业执行预警'

    --#JobError
    IF EXISTS(SELECT TOP 1 1 FROM #JobError AS ih)
    BEGIN
        SET @html_tb1 =
               N'<style>td{FONT-FAMILY: Tahoma,宋体; FONT-SIZE: 12px; LINE-HEIGHT: 20px; TEXT-DECORATION: none;BORDER-BOTTOM: black 1px solid;BORDER-RIGHT: black 1px solid;}th{FONT-FAMILY: Tahoma,宋体; FONT-SIZE: 12px; LINE-HEIGHT: 20px; TEXT-DECORATION: none;BORDER-BOTTOM: black 1px solid;BORDER-RIGHT: black 1px solid;}</style>
               <H2>'+@HostName+'数据库作业执行错误预警</H2>
               报表时间:'+Convert(varchar(19),getdate(),121)+
               N'<table width = "100%" table border="0" cellspacing="0" cellpadding="0" style="BORDER-LEFT: black 1px solid;BORDER-TOP: black 1px solid">
               <tr><th>序号</th><th>实例编号</th><th>作业名称</th><th>所有者</th><th>作业步骤ID</th><th>作业步骤名称</th>
               <th>作业执行结果消息</th><th>作业执行状态</th><th>作业执行时间</th><th>作业执行时长</th>
               </tr>' + 
               CAST (( select td = ROW_NUMBER()over(order by run_datetime desc),'',
                              td = ISNULL(@InstanceID,'NULL'),'',
                              td = jobname,'',
                              td = ownname ,'',
                              td = step_id,'',
                              td = step_name,'',
                              td = messageinfo ,'',
                              td = run_status,'',
                              td = convert(varchar(19),run_datetime,121),'',
                              td = run_duration,''
                        FROM 
                            #JobError AS ih
                        FOR XML PATH('tr')
                        ) AS NVARCHAR(MAX) 
                    ) +
                N'</table><br><br><br>'
    END

    --#JobRunLongtime
    IF EXISTS(SELECT TOP 1 1 FROM #JobRunLongtime AS ih where cast(replace(growth,'%','') as INT)>=100)
    BEGIN
        SET @html_tb2 =
               N'<style>td{FONT-FAMILY: Tahoma,宋体; FONT-SIZE: 12px; LINE-HEIGHT: 20px; TEXT-DECORATION: none;BORDER-BOTTOM: black 1px solid;BORDER-RIGHT: black 1px solid;}th{FONT-FAMILY: Tahoma,宋体; FONT-SIZE: 12px; LINE-HEIGHT: 20px; TEXT-DECORATION: none;BORDER-BOTTOM: black 1px solid;BORDER-RIGHT: black 1px solid;}</style>
               <H2>'+@HostName+'数据库作业执行时间过长预警</H2>
               报表时间:'+Convert(varchar(19),getdate(),121)+
               N'<table width = "100%" table border="0" cellspacing="0" cellpadding="0" style="BORDER-LEFT: black 1px solid;BORDER-TOP: black 1px solid">
               <tr><th>序号</th><th>实例编号</th><th>作业名称</th><th>所有者</th><th>作业步骤ID</th><th>作业步骤名称</th>
               <th>作业执行结果消息</th><th>作业执行时间</th><th>本次作业执行时长</th><th>上次作业执行时长</th><th>执行时长增长百分比</th>
               </tr>' + 
               CAST (( select td = ROW_NUMBER()over(order by cast(replace(growth,'%','') as float) desc),'',
                              td = ISNULL(@InstanceID,'NULL'),'',
                              td = jobname,'',
                              td = ownname ,'',
                              td = step_id,'',
                              td = step_name,'',
                              td = messageinfo ,'',
                              td = convert(varchar(19),run_datetime,121),'',
                              td = run_duration,'',
                              td = prev_run_duration,'',
                              td = growth+'%',''
                        FROM 
                            #JobRunLongtime AS ih
                        where cast(replace(growth,'%','') as float)>=100
                        FOR XML PATH('tr')
                        ) AS NVARCHAR(MAX) 
                    ) +
                N'</table><br><br><br>'
    END

    SET @html_tb=@html_tb1+@html_tb2

    IF ((EXISTS(SELECT TOP 1 1 FROM #JobError AS ih))
        OR
       (EXISTS(SELECT TOP 1 1 FROM #JobRunLongtime AS ih where cast(replace(growth,'%','') as INT)>=100)))
    BEGIN
        EXEC msdb.dbo.sp_send_dbmail
            @profile_name = 'LKProfile',
            @subject = @sub,
            @recipients = @Recipients,
            @body = @html_tb,
            @body_format = 'HTML' ;
    END*/

    if OBJECT_ID('tempdb..#JobError')is not null
        drop table #JobError
    if OBJECT_ID('tempdb..#JobRunLongtime')is not null
        drop table #JobRunLongtime
    END TRY
    BEGIN CATCH
        insert into ExecErrorLog(InstanceID,Error_Procname,Error_Numbers,Error_Messages,Error_Serverity,Error_States,Error_Lines)
        SELECT @InstanceID,ERROR_PROCEDURE(),ERROR_NUMBER(),ERROR_MESSAGE(),ERROR_SEVERITY(),ERROR_STATE(),ERROR_LINE()
    END CATCH
END
View Code
复制代码

语句每小时执行一次,基本都能正常获取出错作业以及执行时长过长作业。但是从作业历史记录发现部分异常作业没有监控到



第一张图显示,作业在2016-12-24 06:00:24.000开始执行第二步(step_id=2),步骤持续8分12秒后出错(run_status=0)
第二张图显示,作业在2017-01-14 08:40:00.000开始执行第一步(step_id=1),步骤持续117小时26分37秒后成功(run_status=1)
按理来说,监控语句应该报告上面的出错作业,以及执行时长过长的作业。可是并没有收到相关的邮件(虽然有其他邮件间接地反映问题存在,但没能引起及时、足够的关注)。
首先来看获取出错作业的核心语句

语句获取作业执行失败(状态=0)/取消(状态=3),且开始运行时间为最近1小时内的。也就是说,2016-12-24 06:06:00执行监控语句,它只会筛选开始运行时间在2016-12-24 05:06:00~2016-12-24 06:06:00之间失败或取消的作业。而上述第一个作业是在2016-12-24 06:00:24开始运行,在2016-12-24 06:08:36出错。监控语句运行时,它还在执行(未出错)。下一次监控语句的执行时间为2016-12-24 07:06:00执行监控语句,它只会筛选开始运行时间在2016-12-24 06:06:00~2016-12-24 07:06:00之间失败或取消的作业,显然,它忽略了前面出错的作业。
再来看获取执行时长过长作业的核心语句

语句先获取作业执行成功(状态=1),且开始运行时间为最近1小时内的,然后对比前后两次执行时长。显然,它也会忽略第二个作业(在2017-01-14 08:40:00.000开始执行,持续117小时26分37秒后成功)。
问题的根源就是,作业执行时长run_duration跨过监控语句的下次执行时间。要想避免此类异常,语句中应该使用Time the job or step stoped(即检查run_date:run_time+run_duration,也就是完成时间为最近1小时内的数据)。
其实应该将那些正在执行,且持续时间较长的作业也预警出来。msdb.dbo.sysjobhistory不会记录正在执行的作业,需要从msdb.dbo.sysjobactivity获取

复制代码
--获取正在执行的作业(时实更新)
SELECT TOP 50 sj.name,
       sja.start_execution_date,
       sja.last_executed_step_date,
       sja.stop_execution_date,
       sja.next_scheduled_run_date,
       getdate() currentdate    
FROM   msdb.dbo.sysjobs sj
       INNER JOIN (
                SELECT *
                FROM   msdb.dbo.sysjobactivity
                WHERE  session_id IN (SELECT MAX(session_id) session_id
                                      FROM   msdb.dbo.sysjobactivity)
            )sja
            ON  sj.job_id = sja.job_id
WHERE  sj.enabled = 1
       AND sja.start_execution_date IS NOT NULL
       AND sja.stop_execution_date IS NULL
ORDER BY sj.name

--查看指定作业历史记录
select top 20 sj.name,sjh.step_id,sjh.message,sjh.run_status
      ,msdb.dbo.agent_datetime(run_date,run_time) run_time
      ,convert(varchar,msdb.dbo.agent_datetime(run_date,run_duration),108) run_duration
  from msdb.dbo.sysjobhistory sjh
 inner join msdb.dbo.sysjobs sj
    on sjh.job_id=sj.job_id
 where sj.name='JobName'
 order by instance_id desc
View Code
复制代码


注意msdb.dbo.sysjobhistory中的run_duration是HHMMSS格式,不能简单用msdb.dbo.agent_datetime格式化持续时间。
我们在第二个作业,看到持续时间达到117小时,针对此条记录使用msdb.dbo.agent_datetime会报"将 expression 转换为数据类型 nvarchar 时出现算术溢出错误"。实际msdb.dbo.agent_datetime的第二个参数只能取0~235959。感觉转换run_duration为日常熟悉的格式还是不太方便。

posted @   Uest  阅读(305)  评论(0编辑  收藏  举报
编辑推荐:
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
阅读排行:
· 周边上新:园子的第一款马克杯温暖上架
· Open-Sora 2.0 重磅开源!
· 分享 3 个 .NET 开源的文件压缩处理库,助力快速实现文件压缩解压功能!
· Ollama——大语言模型本地部署的极速利器
· DeepSeek如何颠覆传统软件测试?测试工程师会被淘汰吗?
点击右上角即可分享
微信分享提示