on June 7, 2009
This post is to share a script I’ve been working on periodically over the last couple of months to monitor and report on replication latency at the distributor.
I use this in monitoring transactional replication with a stand-alone distributor.
The basic purpose for the script is:
- Check the number of undelivered commands for all active publications
- Log the data found to a table (which is collected on a centralized server so the history can be displayed graphically in Reporting Services 2008– that code not in this post)
- When the number of undelivered commands is higher than allowed, fail the job and send an email with a table highlighting the publications that are behind
- Accept a parameter of dynamic length to allow specifying publications that should not be monitored as “production” (ie, do not alert in monitoring). Our repl servers are not SQL 2008 yet, so I use an XML wad rather than a TVP. I do love TVP’s for this type of thing though.
- And be able to accomplish all these tasks from a non-system database. (No custom monitoring code in system or application dbs!)
So this script shows not only how to check for undelivered commands, but also how to build a quick and easy HTML table to summarize data in emails. I always really like having as much data as possible about the state of the system in an email which is requesting investigation. (For monitoring tie in/SQL job failures are written to the event log which are then picked up.)
Note: there are some email addresses at the bottom of the sproc which I should have parameterized. I am leaving this way because everyone should read the sproc thoroughly before installing it anywhere ;)
The table for Logging
I pull all results to a centralized server, so this table stays small
if object_id('dba.ReplStatusHistory') is null
create table dba.replStatusHistory (
replStatusId bigint identity primary key
, replStatusTime datetime default (getdate()) not null
, distributionDb sysname not null
, publishingServer sysname not null
, publishingDb sysname not null
, subscribingServer sysname not null
, subscribingDb sysname not null
, subscriptionStatusId tinyint not null
, PendingCmdCount bigint not null
, exception bit default 0
, publicationName sysname not null
)
The Sproc
--Usage sample:
--exec [dba].[monitorReplStatus] @testMode=1, @distributionDb='distribution', @sendEmail=2
CREATE PROCEDURE [dba].[monitorReplStatus]
@nocEmailThreshold int = 100000
, @testMode bit =0
, @distributionDb sysname
, @testModeEmailAddress nvarchar(1000) = 'some@body.com'
, @sendEmail tinyint = 2 -- 0= don't mail, 1= always mail, 2=email only on error
, @exceptionXML xml = N'
<EXCEPTION>
<Exclude serverName="SOMESERVER" />
<Exclude serverName="SOMEOTHERSERVER" />
</EXCEPTION>
'
AS
SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;
SET NOCOUNT ON;
declare @html nvarchar(max)
, @emailRecipients nvarchar(2000)
, @subj nvarchar(2000)
, @replStatusTime datetime
, @loggingWarning nvarchar(max)
, @sqlGetSubscribers nvarchar(max)
, @raiseErr bit
, @i tinyint --iterator for loop
, @publishingServer sysname
, @publishingDb sysname
, @publicationName sysname
, @subscriptionType tinyint
, @subscribingServer sysname
, @subscribingDb sysname
, @getStatsSql nvarchar(max)
set @replStatusTime=getdate();
declare @replSubscribers table (
replSubscriberId int identity primary key
, publishingServer sysname not null
, publishingDb sysname not null
, publicationName sysname not null
, subscriptionType tinyint default (1)
, subscribingServer sysname not null
, subscribingDb sysname not null
, subscriptionStatusId tinyint not null
, subscriptionStatus nvarchar(256) not null
, exception bit default(0)
, pendingCmdCount bigInt null
, estimatedProcessSeconds bigInt null
)
declare @replStatus table (
replSubscriberId int primary key
, pendingCmdCount bigInt
, estimatedProcessSeconds bigInt
)
declare @exceptions table (
serverName sysname not null
)
set @subj='Replication Status from ' + @@SERVERNAME + '.' + @distributionDb;
set @loggingWarning='' -- can't be null or it won't work later
set @raiseErr=0 -- should not be null
--First, parse exclusions
insert @exceptions (serverName)
SELECT DISTINCT
serverName = x.serverName
FROM
(
SELECT DISTINCT
serverName = e.i.value('@serverName','sysname')
FROM @exceptionXML.nodes('EXCEPTION/Exclude') e(i)
where e.i.value('@serverName','sysname') is not null
) x
--Now, get our list of subscribers for this distribution db
select @sqlGetSubscribers='
use ' + @distributionDb + ';
select
publishingServer = pub.srvname
, publishingDb = coalesce(sb.publisher_db,''????'')
, publicationName = pb.publication
, subscriptionType= sb.subscription_type
, subscribingServer = sub.srvname
, subscribingDb= coalesce(sb.subscriber_db,''????'')
, subscriptionStatusId=coalesce(sb.status,-1)
, subscriptionStatus= case sb.status when 0 then ''Inactive'' when 1 then ''Subscribed'' when 2 then ''Active'' else ''???'' end
from MSSubscriptions sb (nolock)
join MSPublications pb (nolock) on
sb.publication_id=pb.publication_id
join master.dbo.sysservers sub (nolock) on
sb.subscriber_id= sub.srvid
join master.dbo.sysservers pub (nolock) on
sb.publisher_id= pub.srvid
group by
pub.srvname
, sb.publisher_db
, pb.publication
, sb.subscription_type
, sub.srvname
, sb.subscriber_db
, sb.status
'
if @testMode=1 print @sqlGetSubscribers
begin try
insert @replSubscribers (publishingServer, publishingDb, publicationName, subscriptionType, subscribingServer, subscribingDb, subscriptionStatusId, subscriptionStatus)
exec sp_executesql @sqlGetSubscribers
if @@rowcount =0
begin
set @loggingWarning=@loggingWarning + 'No subscribers found.'
set @raiseErr=1
end
else
update @replSubscribers
set exception =1
from @replSubscribers rs
join @exceptions x on
rs.subscribingServer=x.serverName
end try
begin catch
set @loggingWarning=@loggingWarning + 'Could not pull subscriber list.'
set @raiseErr=1
end catch
set @i=1;
--Now get the undelivered commands for each subscriber
while @raiseErr != 1 and @i = @nocEmailThreshold and exception =0) >= 1
begin
set @sendEmail=1
end
--Fix up the email and send it
if @sendEmail=1
begin
SELECT @html=
'
BODY {background-color:floralwhite; font-family: sans-serif}
TABLE {border-width: 1px;border-style: solid;border-color: black;border-collapse: collapse;}
TH {border-width: 1px;padding: 2px;border-style: solid;border-color: black; background-color:#f0f0f0; text-align: center; padding: 8px;}
TD {border-width: 1px;padding: 2px;border-style: solid;border-color: black; background-color:#f9f9f9; text-align: center; padding: 8px;}
<H1>PROD Replication Undelivered Commands</H1>'
+ case when @loggingWarning is not null then '<H3><font color="red">' + @loggingWarning + '</font></H3>' else '' end
+ '
Replication status from ' + @@SERVERNAME + '.' + @distributionDb
+ ' is summarized below. Please ticket and escalate any instances of undelivered commands for production servers over <font color="red">'
+ cast(@nocEmailThreshold as nvarchar) + '</font> as as per the <a href="http://yourTSG/OrOpsGuide/'+ @@SERVERNAME + '">'+ @@SERVERNAME +' wiki</a>.
<H2>Production Servers</H2>
'
--Build the upper table to display production data
SELECT
@html=@html + '
<TABLE>
<TR>
<TH>
publishing server
</TH><TH>
publishing db
</TH><TH>
subscribing server
</TH><TH>
subscribing db
</TH><TH>
publication name
</TH><TH>
subscription status
</TH><TH>
undelivered commands
</TH>
</TR>
';
SELECT
@html=@html +
'<TR>
<TD>' + publishingServer
+ '</TD>
<TD>'+
publishingDb
+ '</TD>
<TD>'+
subscribingServer
+ '</TD>
<TD>'+
subscribingDb
+ '</TD>
<TD>'+
publicationName
+ '</TD>
<TD>'+
subscriptionStatus
+ '</TD>
<TD>'
+ case when coalesce(PendingCmdCount,-1) >= @nocEmailThreshold then '<font color="red">' else '' end
+ cast (coalesce(PendingCmdCount,-1) as nvarchar(256))
+ case when (coalesce(PendingCmdCount,-1) >= @nocEmailThreshold ) then '</font>' else '' end
+ '</TD>
</TR>
'
from @replSubscribers sub
where exception = 0
order by
publishingServer
, publishingDb
, subscribingServer
, subscribingDb
SELECT @html=@html +
'</TABLE>'
--Build the lower table to display exceptions (which are just an FYI)
SELECT
@html=@html + '
<H2>Non-Production Servers (informational only, do not ticket)</H2>
'
SELECT
@html=@html + '
<TABLE>
<TR>
<TH>
publishing server
</TH><TH>
publishing db
</TH><TH>
subscribing server
</TH><TH>
subscribing db
</TH><TH>
publication name
</TH><TH>
subscription status
</TH><TH>
undelivered commands
</TH>
</TR>
';
SELECT
@html=@html +
'<TR>
<TD>' + publishingServer
+ '</TD>
<TD>'+
publishingDb
+ '</TD>
<TD>'+
subscribingServer
+ '</TD>
<TD>'+
subscribingDb
+ '</TD>
<TD>'+
publicationName
+ '</TD>
<TD>'+
subscriptionStatus
+ '</TD>
<TD>'
-- don't color these red, they have exceptions....
+ cast (coalesce(PendingCmdCount,-1) as nvarchar(256))
+ '</TD>
</TR>
'
from @replSubscribers sub
where exception = 1
order by
publishingServer
, publishingDb
, subscribingServer
, subscribingDb
SELECT @html=@html +
'</TABLE>'
--Build the foooter
SELECT
@html=@html + '
'
if @testMode=1
begin
select * from @exceptions
select * from @replSubscribers sub
select
@emailRecipients = @testModeEmailAddress
end
else
select
@emailRecipients= 'some@body.com' + case when count(*) > 0 then ';some@body.com' else '' end
from @replSubscribers sub
where PendingCmdCount >= @nocEmailThreshold
and exception =0
exec msdb..sp_send_dbmail
@profile_name=@@SERVERNAME
, @recipients = @emailRecipients
, @subject= @subj
, @body_format = 'HTML'
, @body=@html
;
end
--Clean up history
delete from dba.ReplStatusHistory
where replStatusTime = @nocEmailThreshold and exception =0) >= 1
begin
set @raiseErr=1
set @loggingWarning=@loggingWarning+ 'Error threshold exceeded for one or more subscriptions, please investigate.'
end
--Finish up and raise an error if we need to
if @raiseErr=1
begin
print 'Logging warning: ' + @loggingWarning
RAISERROR(@loggingWarning,16,1)
end