From b6d710d60769a3aca1be411d97f51b36ab8f42aa Mon Sep 17 00:00:00 2001
From: Napsty
Date: Tue, 4 Jun 2019 19:05:57 +0200
Subject: [PATCH] Add multi attribute check, with multi warning thresholds

Add -r/--raw to choose which ATA attributes get a raw-value check and
-w/--warn to give per-attribute warning thresholds (NAME=N,NAME=N).
The existing -b value is kept and copied into the threshold table as the
Current_Pending_Sector entry. Attributes named in -e/--exclude are now
skipped for every raw check, not only for Current_Pending_Sector.
The revision is bumped to 6.0.

---
Review notes (text in this area is not part of the commit):

 * The long option "warn=s" is bound to $opt_w, the same variable as
   "w=s". It was bound to $opt_v, so passing --warn never filled the
   threshold list and instead made the `if ($opt_v)` branch run.
 * This patch was recovered from a copy whose line breaks and runs of
   whitespace had been collapsed. Whitespace-only lines and the original
   indentation could not be recovered, and the boundaries of brace-only
   lines are a best reading. Hunk headers are kept verbatim, so their
   line counts may not match the hunk bodies below. Regenerate the patch
   from the repository, or try `git apply --recount --ignore-whitespace`
   and verify the result against the target file.

 check_smart.pl | 90 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 63 insertions(+), 27 deletions(-)

diff --git a/check_smart.pl b/check_smart.pl
index 2ae4cce..e2c6573 100755
--- a/check_smart.pl
+++ b/check_smart.pl
@@ -33,13 +33,14 @@
 # May 5, 2018: Claudio Kuenzler - Check selftest log for errors using new parameter -s (rev 5.10)
 # Dec 27, 2018: Claudio Kuenzler - Add exclude list (-e) to ignore certain attributes (5.11)
 # Jan 8, 2019: Claudio Kuenzler - Fix 'Use of uninitialized value' warnings (5.11.1)
+# Jun 4, 2019: Claudio Kuenzler - Add raw check list (-r) and warning thresholds (-w) (6.0)
 use strict;
 use Getopt::Long;
 use File::Basename qw(basename);
 my $basename = basename($0);
-my $revision = '5.11.1';
+my $revision = '6.0';
 use FindBin;
 use lib $FindBin::Bin;
@@ -52,7 +53,7 @@ BEGIN
 $ENV{'BASH_ENV'}='';
 $ENV{'ENV'}='';
-use vars qw($opt_b $opt_d $opt_g $opt_debug $opt_h $opt_i $opt_e $opt_s $opt_v);
+use vars qw($opt_b $opt_d $opt_g $opt_debug $opt_h $opt_i $opt_e $opt_r $opt_s $opt_v $opt_w);
 Getopt::Long::Configure('bundling');
 GetOptions(
     "debug" => \$opt_debug,
@@ -62,8 +63,10 @@ BEGIN
     "h" => \$opt_h, "help" => \$opt_h,
     "i=s" => \$opt_i, "interface=s" => \$opt_i,
     "e=s" => \$opt_e, "exclude=s" => \$opt_e,
+    "r=s" => \$opt_r, "raw=s" => \$opt_r,
     "s" => \$opt_s, "selftest" => \$opt_s,
     "v" => \$opt_v, "version" => \$opt_v,
+    "w=s" => \$opt_w, "warn=s" => \$opt_w,
 );
 if ($opt_v) {
@@ -151,6 +154,26 @@ BEGIN
 my $exclude_list = $opt_e // '';
 my @exclude_list = split /,/, $exclude_list;
+# raw check list
+my $raw_check_list = $opt_r // 'Current_Pending_Sector,Reallocated_Sector_Ct,Program_Fail_Cnt_Total,Uncorrectable_Error_Cnt,Offline_Uncorrectable,Runtime_Bad_Block';
+my @raw_check_list = split /,/, $raw_check_list;
+# warning threshold list (for raw checks)
+my $warn_list = $opt_w // '';
+my @warn_list = split /,/, $warn_list;
+my %warn_list;
+my $warn_key;
+my $warn_value;
+foreach my $warn_element (@warn_list) {
+    ($warn_key, $warn_value) = split /=/, $warn_element;
+    $warn_list{ $warn_key } = $warn_value;
+}
+# For backward compatibility, add -b parameter to warning thresholds
+if ($opt_b) {
+    $warn_list{ 'Current_Pending_Sector' } = $opt_b;
+}
 foreach $device ( split(":",$device) ){
     foreach $interface ( split(":",$interface) ){
         my @error_messages = qw//;
@@ -309,6 +332,11 @@ BEGIN
         @output = `$full_command`;
         warn "(debug) output:\n@output\n\n" if $opt_debug;
         my @perfdata = qw//;
+        warn "(debug) Raw Check List: $raw_check_list\n" if $opt_debug;
+        warn "(debug) Exclude List: $exclude_list\n" if $opt_debug;
+        warn "(debug) Warning Thresholds:\n" if $opt_debug;
+        for my $warnpair ( sort keys %warn_list ) { warn "$warnpair=$warn_list{$warnpair}\n" if $opt_debug; }
+        warn "\n" if $opt_debug;
         # separate metric-gathering and output analysis for ATA vs SCSI SMART output
         # Yeah - but megaraid is the same output as ata
@@ -335,31 +363,37 @@ BEGIN
                 }
                 push (@perfdata, "$attribute_name=$raw_value") if $opt_d;
-                # do some manual checks for Current_Pending_Sector
-                if ( ($attribute_name eq 'Current_Pending_Sector') && (grep {$_ eq $attribute_name} @exclude_list) ) {
-                    # Current_Pending_Sector is in ignore list, move on
+                # skip attribute if it was set to be ignored in exclude_list
+                if (grep {$_ eq $attribute_name} @exclude_list) {
                     warn "SMART Attribute $attribute_name was set to be ignored\n" if $opt_debug;
                     next;
-                }
-                if ( ($attribute_name eq 'Current_Pending_Sector') && $raw_value ) {
-                    if ($opt_b) {
-                        if (($raw_value > 0) && ($raw_value >= $opt_b)) {
-                            push(@error_messages, "$raw_value Sectors pending re-allocation");
-                            escalate_status('WARNING');
-                            warn "(debug) Current_Pending_Sector is non-zero ($raw_value)\n\n" if $opt_debug;
-                        }
-                        elsif (($raw_value > 0) && ($raw_value < $opt_b)) {
-                            push(@error_messages, "$raw_value Sectors pending re-allocation (but less than threshold $opt_b)");
-                            warn "(debug) Current_Pending_Sector is non-zero ($raw_value) but less than $opt_b\n\n" if $opt_debug;
-                        }
-                    } else {
-                        push(@error_messages, "Sectors pending re-allocation");
-                        escalate_status('WARNING');
-                        warn "(debug) Current_Pending_Sector is non-zero ($raw_value)\n\n" if $opt_debug;
-                    }
+                } else {
+                    # manual checks on raw values for certain attributes deemed significant
+                    if (grep {$_ eq $attribute_name} @raw_check_list) {
+                        if ($raw_value > 0) {
+                            # Check for warning thresholds
+                            if ( ($warn_list{$attribute_name}) && ($raw_value >= $warn_list{$attribute_name}) ) {
+                                warn "(debug) $attribute_name is non-zero ($raw_value)\n\n" if $opt_debug;
+                                push(@error_messages, "$attribute_name is non-zero ($raw_value)");
+                                escalate_status('WARNING');
+                            } elsif ( ($warn_list{$attribute_name}) && ($raw_value < $warn_list{$attribute_name}) ) {
+                                warn "(debug) $attribute_name is non-zero ($raw_value) but less than $warn_list{$attribute_name}\n\n" if $opt_debug;
+                                push(@error_messages, "$attribute_name is non-zero ($raw_value) (but less than threshold $warn_list{$attribute_name})");
+                            }
+                            else {
+                                warn "(debug) $attribute_name is non-zero ($raw_value)\n\n" if $opt_debug;
+                                push(@error_messages, "$attribute_name is non-zero ($raw_value)");
+                                escalate_status('WARNING');
+                            }
+                        } else {
+                            warn "(debug) $attribute_name is OK ($raw_value)\n\n" if $opt_debug;
+                        }
+                    } else {
+                        warn "(debug) $attribute_name not in raw check list (raw value: $raw_value)\n\n" if $opt_debug;
+                    }
+                }
             }
-
         } else {
             my ($current_temperature, $max_temperature, $current_start_stop, $max_start_stop) = qw//;
             foreach my $line(@output){
@@ -467,7 +501,7 @@ BEGIN
 sub print_help {
     print_revision($basename,$revision);
-    print "\nUsage: $basename {-d=|-g=} -i=(auto|ata|scsi|3ware,N|areca,N|hpt,L/M/N|cciss,N|megaraid,N) [-b N] [-e list] [--debug]\n\n";
+    print "\nUsage: $basename {-d=|-g=} -i=(auto|ata|scsi|3ware,N|areca,N|hpt,L/M/N|cciss,N|megaraid,N) [-r list] [-w list] [-b N] [-e list] [--debug]\n\n";
     print "At least one of the below. -d supersedes -g\n";
     print " -d/--device: a physical block device to be SMART monitored, eg /dev/sda\n";
     print " -g/--global: a regular expression name of physical devices to be SMART monitored\n";
@@ -475,10 +509,12 @@ sub print_help {
     print "Note that -g only works with a fixed interface input (e.g. scsi, ata), not with special interface ids like cciss,1\n";
     print "\n";
     print "Other options\n";
-    print " -i/--interface: device's interface type\n";
+    print " -i/--interface: device's interface type (auto|ata|scsi|3ware,N|areca,N|hpt,L/M/N|cciss,N|megaraid,N)\n";
     print " (See http://www.smartmontools.org/wiki/Supported_RAID-Controllers for interface convention)\n";
-    print " -b/--bad: Threshold value (integer) when to warn for N bad entries\n";
-    print " -e/--exclude: List of (comma separated) SMART attributes which should be excluded (=ignored)\n";
+    print " -r/--raw Comma separated list of ATA attributes to check (default: Current_Pending_Sector,Reallocated_Sector_Ct,Program_Fail_Cnt_Total,Uncorrectable_Error_Cnt,Offline_Uncorrectable,Runtime_Bad_Block)\n";
+    print " -b/--bad: Threshold value for Current_Pending_Sector for ATA and 'grown defect list' for SCSI drives\n";
+    print " -w/--warn Comma separated list of thresholds for ATA drives (e.g. Reallocated_Sector_Ct=10,Current_Pending_Sector=62)\n";
+    print " -e/--exclude: Comma separated list of SMART attributes which should be excluded (=ignored)\n";
     print " -s/--selftest: Enable self-test log check";
     print " -h/--help: this help\n";
     print " --debug: show debugging information\n";