#!/usr/bin/perl -w # check_megaraid_sas Nagios plugin # Copyright (C) 2007 Jonathan Delgado, delgado@molbio.mgh.harvard.edu # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # # # Nagios plugin to monitor the status of volumes attached to a LSI Megaraid SAS # controller, such as the Dell PERC5/i and PERC5/e. If you have any hotspares # attached to the controller, you can specify the number you should expect to # find with the '-s' flag. # # The paths for the Nagios plugins lib and MegaCli may need to me changed. # # $Author: delgado $ # $Revision: #3 $ $Date: 2007/06/07 $ # Slightly modified by Daniel B. for zabbix # 23 Apr 2009 use strict; use Getopt::Std; our($opt_h, $opt_s, $opt_o, $opt_m, $opt_p); getopts('hs:o:p:m:'); if ( $opt_h ) { print "Usage: $0 [-s number] [-m number] [-o number]\n"; print " -s is how many hotspares are attached to the controller\n"; print " -m is the number of media errors to ignore\n"; print " -p is the predictive error count to ignore\n"; print " -o is the number of other disk errors to ignore\n"; exit; } my $megacli = (-x '/opt/MegaRAID/MegaCli/MegaCli64') ? '/opt/MegaRAID/MegaCli/MegaCli64' : '/opt/MegaRAID/MegaCli/MegaCli'; ## Return codes for Nagios my %ERRORS=('OK'=>0,'WARNING'=>1,'CRITICAL'=>2,'UNKNOWN'=>3,'DEPENDENT'=>4); my ($adapters); my $hotspares = 0; my $hotsparecount = 0; my $pdbad = 0; my $pdcount = 0; my $mediaerrors = 0; my $mediaallow = 0; my $prederrors = 0; my $predallow = 0; my $othererrors = 0; my $otherallow = 0; my $result = ''; my $status = 'OK'; sub max_state ($$) { my ($current, $compare) = @_; if (($compare eq 'CRITICAL') || ($current eq 'CRITICAL')) { return 'CRITICAL'; } elsif ($compare eq 'OK') { return $current; } elsif ($compare eq 'WARNING') { return 'WARNING'; } elsif (($compare eq 'UNKNOWN') && ($current eq 'OK')) { return 'UNKNOWN'; } else { return $current; } } if ( $opt_s ) { $hotspares = $opt_s; } if ( $opt_m ) { $mediaallow = $opt_m; } if ( $opt_p ) { $predallow = $opt_p; } if ( $opt_o ) { $otherallow = $opt_o; } # Get the number of RAID controllers we have open (ADPCOUNT, "$megacli -adpCount -NoLog |") || die "error: Could not execute MegaCli -adpCount"; while (<ADPCOUNT>) { if ( m/Controller Count:\s*(\d+)/ ) { $adapters = $1; last; } } close ADPCOUNT; ADAPTER: for ( my $adp = 0; $adp < $adapters; $adp++ ) { # Get the number of logical drives on this adapter open (LDGETNUM, "$megacli -LdGetNum -a$adp -NoLog |") || die "error: Could not execute $megacli -LdGetNum -a$adp"; my ($ldnum); while (<LDGETNUM>) { if ( m/Number of Virtual drives configured on adapter \d:\s*(\d+)/i ) { $ldnum = $1; last; } } close LDGETNUM; open (CFGDSPLY, "$megacli -CfgDsply -a$adp -NoLog |") || die "error: Could not execute $megacli -CfgDsply -a$adp -NoLog"; my $hba = 0; my $failgrouplist = 0; while (<CFGDSPLY>) { if ( m/Failed to get Disk Group list/ ) { $failgrouplist = 1; } if ( m/Product Name:.*(JBOD|HBA)/ ) { $hba = 1; } } close CFGDSPLY; # When controller is in HBA/JBOD mode, skip RAID volume checks unless ($hba && $failgrouplist) { LDISK: for ( my $ld = 0; $ld < $ldnum; $ld++ ) { # Get info on this particular logical drive open (LDINFO, "$megacli -LdInfo -L$ld -a$adp -NoLog |") || die "error: Could not execute $megacli -LdInfo -L$ld -a$adp -NoLog"; my ($size, $unit, $raidlevel, $ldpdcount, $spandepth, $state); while (<LDINFO>) { if ( m/^Size\s*:\s*(\d+(\.\d+)?)\s*(MB|GB|TB)/ ) { $size = $1; $unit = $3; # Adjust MB to GB if that's what we got if ( $unit eq 'MB' ) { $size = sprintf( "%.0f", ($size / 1024) ); $unit= 'GB'; } } elsif ( m/^State\s*:\s*(\w+(\s\w+)?)/ ) { $state = $1; if ( $state ne 'Optimal' ) { $status = 'CRITICAL'; } } elsif ( m/^Number Of Drives( per span)?\s*:\s*(\d+)/ ) { $ldpdcount = $2; } elsif ( m/^Span Depth\s*:\s*(\d+)/ ) { $spandepth = $1; $ldpdcount = $ldpdcount * $spandepth; } elsif ( m/^RAID Level\s*:\s*Primary-(\d)/ ) { $raidlevel = $1; } } close LDINFO; $result .= "$adp:$ld:RAID-$raidlevel:$ldpdcount drives:$size$unit:$state "; } #LDISK close LDINFO; } # Get info on physical disks for this adapter open (PDLIST, "$megacli -PdList -a$adp -NoLog |") || die "error: Could not execute $megacli -PdList -a$adp -NoLog"; my ($slotnumber,$fwstate); PDISKS: while (<PDLIST>) { if ( m/Slot Number:\s*(\d+)/ ) { $slotnumber = $1; # Don't care about backplane error counts next if ( $slotnumber == 255 ); $pdcount++; } elsif ( m/(\w+) Error Count:\s*(\d+)/ ) { if ( $1 eq 'Media') { $mediaerrors += $2; } else { $othererrors += $2; } } elsif ( m/Predictive Failure Count:\s*(\d+)/ ) { $prederrors += $1; } elsif ( m/Firmware state:\s*(\w+)/ ) { $fwstate = $1; if ( $fwstate =~ m/Hotspare/ ) { $hotsparecount++; } elsif ( $fwstate =~ m/^Online/ ) { # Do nothing } elsif ( $slotnumber != 255 ) { $pdbad++; $status = 'CRITICAL'; } } } #PDISKS close PDLIST; } $result .= "Drives:$pdcount "; # Any bad disks? if ( $pdbad ) { $result .= "$pdbad Bad Drives "; } my $errorcount = $mediaerrors + $prederrors + $othererrors; # Were there any errors? if ( $errorcount ) { $result .= "($errorcount Errors) "; if ( ( $mediaerrors > $mediaallow ) || ( $prederrors > $predallow ) || ( $othererrors > $otherallow ) ) { $status = max_state($status, 'WARNING'); } } # Do we have as many hotspares as expected (if any) if ( $hotspares ) { if ( $hotsparecount < $hotspares ) { $status = max_state($status, 'WARNING'); $result .= "Hotspare(s):$hotsparecount (of $hotspares)"; } else { $result .= "Hotspare(s):$hotsparecount"; } } print STDOUT "$status: $result\n"; exit $ERRORS{$status};