diff --git a/conf/raid_megaraid.conf b/conf/raid_megaraid.conf new file mode 100644 index 0000000..8fa0bf6 --- /dev/null +++ b/conf/raid_megaraid.conf @@ -0,0 +1,12 @@ +# Description: MegaRaid Status +# Type: Agent or Agent (active) +# Key: raid.mega.status +# Type of Information: Character +# Show Value: As is + +# The value reported is like: +# State: OK: 0:0:RAID-1:2 drives:68GB:Optimal 0:1:RAID-5:4 drives:837GB:Optimal Drives:7 + +# You can add a simple trigger on this check like: +# { hostname:raid.mega.status.str( OK ) }=0 +UserParameter=raid.mega.status,/usr/bin/sudo /var/lib/zabbix/bin/check_raid_megaraid_sudo diff --git a/scripts/check_raid_megaraid_sudo b/scripts/check_raid_megaraid_sudo new file mode 100755 index 0000000..09afb35 --- /dev/null +++ b/scripts/check_raid_megaraid_sudo @@ -0,0 +1,227 @@ +#!/usr/bin/perl -w + +# check_megaraid_sas Nagios plugin +# Copyright (C) 2007 Jonathan Delgado, delgado@molbio.mgh.harvard.edu +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +# +# +# Nagios plugin to monitor the status of volumes attached to a LSI Megaraid SAS +# controller, such as the Dell PERC5/i and PERC5/e. If you have any hotspares +# attached to the controller, you can specify the number you should expect to +# find with the '-s' flag. 
#
# The paths for the Nagios plugins lib and MegaCli may need to be changed.
#
# $Author: delgado $
# $Revision: #3 $ $Date: 2007/06/07 $

# Slightly modified by Daniel B. for zabbix
# 23 Apr 2009

use strict;
use warnings;
use Getopt::Std;

our($opt_h, $opt_s, $opt_o, $opt_m, $opt_p);

getopts('hs:o:p:m:');

if ( $opt_h ) {
    print "Usage: $0 [-s number] [-m number] [-p number] [-o number]\n";
    print "       -s is how many hotspares are attached to the controller\n";
    print "       -m is the number of media errors to ignore\n";
    print "       -p is the predictive error count to ignore\n";
    print "       -o is the number of other disk errors to ignore\n";
    exit;
}

# Prefer the 64-bit MegaCli binary when it is installed and executable.
my $megacli = (-x '/opt/MegaRAID/MegaCli/MegaCli64') ?
    '/opt/MegaRAID/MegaCli/MegaCli64' : '/opt/MegaRAID/MegaCli/MegaCli';

## Return codes for Nagios
my %ERRORS=('OK'=>0,'WARNING'=>1,'CRITICAL'=>2,'UNKNOWN'=>3,'DEPENDENT'=>4);

# Thresholds taken from the command line (all default to 0).
my $hotspares   = defined $opt_s ? $opt_s : 0;   # hotspares expected
my $mediaallow  = defined $opt_m ? $opt_m : 0;   # media errors tolerated
my $predallow   = defined $opt_p ? $opt_p : 0;   # predictive failures tolerated
my $otherallow  = defined $opt_o ? $opt_o : 0;   # other errors tolerated

# Counters accumulated over every adapter.
my $hotsparecount = 0;
my $pdbad         = 0;
my $pdcount       = 0;
my $mediaerrors   = 0;
my $prederrors    = 0;
my $othererrors   = 0;

my $result = '';     # human readable summary, built piece by piece
my $status = 'OK';   # worst state seen so far (a key of %ERRORS)

# max_state($current, $compare)
#
# Combine two state names and return the one that should be reported.
# CRITICAL always wins, WARNING replaces anything but CRITICAL, and
# UNKNOWN only replaces OK.  Any other $compare leaves $current untouched.
sub max_state {
    my ($current, $compare) = @_;

    if (($compare eq 'CRITICAL') || ($current eq 'CRITICAL')) {
        return 'CRITICAL';
    } elsif ($compare eq 'OK') {
        return $current;
    } elsif ($compare eq 'WARNING') {
        return 'WARNING';
    } elsif (($compare eq 'UNKNOWN') && ($current eq 'OK')) {
        return 'UNKNOWN';
    } else {
        return $current;
    }
}

# megacli_output(@args)
#
# Run MegaCli with the given arguments and return its output as a list of
# lines.  The list form of the pipe open is used so no shell is involved.
# Dies when the command cannot be started.  As in the original script the
# status returned by close() is not inspected.
sub megacli_output {
    my @args = @_;

    open( my $pipe, '-|', $megacli, @args )
        or die "error: Could not execute $megacli @args: $!";
    my @lines = <$pipe>;
    close $pipe;

    return @lines;
}

# Get the number of RAID controllers we have
my $adapters;
for my $line ( megacli_output( '-adpCount', '-NoLog' ) ) {
    if ( $line =~ m/Controller Count:\s*(\d+)/ ) {
        $adapters = $1;
        last;
    }
}

# Without a controller count nothing can be checked; do not pretend all is OK.
if ( !defined $adapters ) {
    print STDOUT "UNKNOWN: Could not determine the number of RAID controllers\n";
    exit $ERRORS{'UNKNOWN'};
}

ADAPTER: for my $adp ( 0 .. $adapters - 1 ) {
    # Get the number of logical drives on this adapter
    my $ldnum;
    for my $line ( megacli_output( '-LdGetNum', "-a$adp", '-NoLog' ) ) {
        if ( $line =~ m/Number of Virtual drives configured on adapter \d+:\s*(\d+)/i ) {
            $ldnum = $1;
            last;
        }
    }
    if ( !defined $ldnum ) {
        # Logical drives of this adapter cannot be inspected.
        $status = max_state($status, 'UNKNOWN');
        $result .= "$adp:logical drive count unavailable ";
        $ldnum = 0;
    }

    LDISK: for my $ld ( 0 .. $ldnum - 1 ) {
        # Get info on this particular logical drive
        my ($size, $unit, $raidlevel, $drives_per_span, $spandepth, $state);
        for my $line ( megacli_output( '-LdInfo', "-L$ld", "-a$adp", '-NoLog' ) ) {
            if ( $line =~ m/^Size\s*:\s*(\d+(\.\d+)?)\s*(MB|GB|TB)/ ) {
                $size = $1;
                $unit = $3;
                # Adjust MB to GB if that's what we got
                if ( $unit eq 'MB' ) {
                    $size = sprintf( "%.0f", ($size / 1024) );
                    $unit = 'GB';
                }
            } elsif ( $line =~ m/^State\s*:\s*(\w+)/ ) {
                $state = $1;
                if ( $state ne 'Optimal' ) {
                    $status = 'CRITICAL';
                }
            } elsif ( $line =~ m/^Number Of Drives( per span)?\s*:\s*(\d+)/ ) {
                $drives_per_span = $2;
            } elsif ( $line =~ m/^Span Depth\s*:\s*(\d+)/ ) {
                $spandepth = $1;
            } elsif ( $line =~ m/^RAID Level\s*:\s*Primary-(\d)/ ) {
                $raidlevel = $1;
            }
        }

        # Total drives = drives per span * span depth.  Computed after the
        # whole output was read so the order of the two lines does not matter.
        my $ldpdcount;
        if ( defined $drives_per_span ) {
            $ldpdcount = $drives_per_span * ( defined $spandepth ? $spandepth : 1 );
        }

        # A logical drive whose state could not be read must not count as OK.
        if ( !defined $state ) {
            $status = max_state($status, 'UNKNOWN');
        }

        # Use '?' for anything MegaCli did not report.
        my $level_txt  = defined $raidlevel ? $raidlevel     : '?';
        my $drives_txt = defined $ldpdcount ? $ldpdcount     : '?';
        my $size_txt   = defined $size      ? "$size$unit"   : '?';
        my $state_txt  = defined $state     ? $state         : '?';

        $result .= "$adp:$ld:RAID-$level_txt:$drives_txt drives:$size_txt:$state_txt ";

    } #LDISK

    # Get info on physical disks for this adapter
    my $is_backplane = 0;   # true while reading the entry of slot 255
    PDISKS: for my $line ( megacli_output( '-PdList', "-a$adp", '-NoLog' ) ) {
        if ( $line =~ m/Slot Number:\s*(\d+)/ ) {
            # Slot 255 is the backplane, not a real drive.
            $is_backplane = ( $1 == 255 );
            $pdcount++ unless $is_backplane;
            next PDISKS;
        }

        # Don't care about backplane error counts or state
        next PDISKS if $is_backplane;

        if ( $line =~ m/(\w+) Error Count:\s*(\d+)/ ) {
            my ($kind, $count) = ($1, $2);
            if ( $kind eq 'Media' ) {
                $mediaerrors += $count;
            } else {
                $othererrors += $count;
            }
        } elsif ( $line =~ m/Predictive Failure Count:\s*(\d+)/ ) {
            $prederrors += $1;
        } elsif ( $line =~ m/Firmware state:\s*(\w+)/ ) {
            my $fwstate = $1;
            if ( $fwstate =~ m/Hotspare/ ) {
                $hotsparecount++;
            } elsif ( $fwstate =~ m/^Online/ ) {
                # Do nothing
            } else {
                # Neither online nor a hotspare: treat the drive as failed.
                $pdbad++;
                $status = 'CRITICAL';
            }
        }
    } #PDISKS
}

$result .= "Drives:$pdcount ";

# Any bad disks?
if ( $pdbad ) {
    $result .= "$pdbad Bad Drives ";
}

my $errorcount = $mediaerrors + $prederrors + $othererrors;
# Were there any errors?
if ( $errorcount ) {
    $result .= "($errorcount Errors) ";
    if ( ( $mediaerrors > $mediaallow ) ||
         ( $prederrors > $predallow )   ||
         ( $othererrors > $otherallow ) ) {
        $status = max_state($status, 'WARNING');
    }
}

# Do we have as many hotspares as expected (if any)
if ( $hotspares ) {
    if ( $hotsparecount < $hotspares ) {
        $status = max_state($status, 'WARNING');
        $result .= "Hotspare(s):$hotsparecount (of $hotspares)";
    } else {
        $result .= "Hotspare(s):$hotsparecount";
    }
}

print STDOUT "$status: $result\n";
exit $ERRORS{$status};