Compare commits

...

9 Commits

6 changed files with 392 additions and 2 deletions

View File

@ -1 +1 @@
0.2.171-1 ./
0.2.172-1 ./

View File

@ -4,7 +4,7 @@
Summary: Scripts for Zabbix monitoring
Name: zabbix-agent-addons
Version: 0.2.171
Version: 0.2.172
Release: 1%{?dist}
Source0: %{name}-%{version}.tar.gz
BuildArch: noarch
@ -106,6 +106,16 @@ fi
%endif
%changelog
* Thu Dec 21 2023 Daniel Berteaud <dbd@ehtrace.com> 0.2.172-1
- Add Zabbix template for storageDevices (dbd@ehtrace.com)
- Read raw value for SSD_Life_Left (dbd@ehtrace.com)
- Read SSD_Life_Left if available (dbd@ehtrace.com)
- /dev/bus/0 might not exist but can be queried (dbd@ehtrace.com)
- Report more info from some NVMe (dbd@ehtrace.com)
- Adjust default values for stor dev (dbd@ehtrace.com)
- Fix UserParam (dbd@ehtrace.com)
- Add new script for smart monitoring (dbd@ehtrace.com)
* Tue Sep 19 2023 Daniel Berteaud <dbd@ehtrace.com> 0.2.171-1
- Ignore samba NT_STATUS_PROTOCOL_UNREACHABLE errors (dbd@ehtrace.com)

View File

@ -5,3 +5,7 @@ UserParameter=hardware.disk.smart.discovery,/usr/bin/sudo /var/lib/zabbix/bin/di
# Takes two args: the drives to check, and the value to get
# eg: hardware.disk.smart[/dev/sda,Reallocated_Sector_Ct]
UserParameter=hardware.disk.smart[*],/usr/bin/sudo /var/lib/zabbix/bin/check_smart_sudo $1 $2
# New smart disk discovery/monitoring
UserParameter=stor.dev.discovery[*],/usr/bin/sudo /var/lib/zabbix/bin/disco_stor_dev_sudo
UserParameter=stor.dev.info[*],/usr/bin/sudo /var/lib/zabbix/bin/check_stor_dev_sudo --dev "$1" --type "$2"

View File

@ -0,0 +1,120 @@
#!/usr/bin/perl
use strict;
use warnings;
use JSON;
use Getopt::Long;
use File::Which;
# Command line options and their defaults
my $dev = undef;     # --device: device node to query (mandatory)
my $type = 'auto';   # --type: device type handed to smartctl -d
my $what = 'json';   # --what: 'json' for the whole document, or one key of $json
my $pretty = 0;      # --pretty: pretty-print the JSON output

GetOptions(
    'device=s' => \$dev,
    'type=s'   => \$type,
    'what=s'   => \$what,
    'pretty'   => \$pretty
);

# --device and --type are placed on the smartctl command line, so only a strict
# set of characters is accepted. \A and \z are used rather than ^ and $ because
# $ also matches right before a trailing newline, which would let a value such
# as "/dev/sda\n" through.
if (not defined $dev or $dev !~ m|\A/dev/\w+(/\w+)*\z|){
    print "Invalid --device\n";
    exit 1;
} elsif ($what !~ m/\A\w+\z/){
    print "Invalid --what\n";
    exit 1;
} elsif ($type !~ m/\A\w+\+*\w+(,\w+)*\z/){
    print "Invalid --type\n";
    exit 1;
}

# Values reported for every key that smartctl does not provide for this device
my $json = {
    temperature_celsius      => 25,
    power_on_hours           => 0,
    power_cycle_count        => 0,
    reallocated_sector_count => 0,
    current_pending_sector   => 0,
    offline_uncorrectable    => 0,
    percent_lifetime_remain  => 100,
    firmware_version         => 0
};

# Location of the smartctl binary; checked for definedness before it is used
my $smartctl = which('smartctl');
# Print the requested information on stdout and terminate the script.
# - $what eq 'json': the whole $json document is printed, exit status 0
# - $what is a key of $json holding a defined value: that value, exit status 0
# - anything else: ZBX_NOTSUPPORTED, exit status 1
sub print_out {
    if ($what eq 'json'){
        print to_json($json, { pretty => $pretty });
        exit 0;
    }
    unless (defined $json->{$what}){
        print "ZBX_NOTSUPPORTED\n";
        exit 1;
    }
    print $json->{$what} . "\n";
    exit 0;
}
# Look up one entry of the ATA SMART attribute table by its name.
# Arguments:
#   $smart: decoded smartctl JSON document (hash reference)
#   $attr:  attribute name, compared with the "name" field of each table entry
# Returns the first matching table entry (hash reference), or undef when the
# document has no ata_smart_attributes table or no entry carries that name.
sub get_smart_attr {
    my ($smart, $attr) = @_;
    my $table = $smart->{ata_smart_attributes}->{table};
    return undef unless defined $table;
    foreach my $entry (@{$table}){
        return $entry if $entry->{name} eq $attr;
    }
    return undef;
}
# Without smartctl nothing can be read. 'error' is not a key of $json, so
# print_out() answers ZBX_NOTSUPPORTED and exits with status 1.
if (not defined $smartctl){
    $what = 'error';
    print_out();
}

# Query the device. smartctl is started through a list-form pipe open so that
# no shell is involved, and the decoding is wrapped in eval because from_json()
# dies on empty or malformed input (e.g. when smartctl could not be executed).
my $data = eval {
    open(my $pipe, '-|', $smartctl, '-a', $dev, '-d', $type, '--json=c')
        or die "Cannot run $smartctl: $!\n";
    my $output = do { local $/; <$pipe> };
    # The close status is deliberately ignored: smartctl uses its exit status
    # as a bitmask and returns non-zero even when it printed a usable report
    close($pipe);
    from_json($output);
};
# No usable document: same answer as when smartctl is missing
if (ref($data) ne 'HASH'){
    $what = 'error';
    print_out();
}

# Generic fields, only overriding the defaults when smartctl reports them
if (defined $data->{temperature}->{current}){
    $json->{temperature_celsius} = $data->{temperature}->{current};
}
if (defined $data->{power_on_time}->{hours}){
    $json->{power_on_hours} = $data->{power_on_time}->{hours};
}
if (defined $data->{power_cycle_count}){
    $json->{power_cycle_count} = $data->{power_cycle_count};
}
if (defined $data->{firmware_version}){
    $json->{firmware_version} = $data->{firmware_version};
}

my ($pending, $realloc, $offline, $remain);

if ($pending = get_smart_attr($data, 'Current_Pending_Sector')){
    $json->{current_pending_sector} = $pending->{raw}->{value};
}

if ($realloc = get_smart_attr($data, 'Reallocated_Sector_Ct') || get_smart_attr($data, 'Reallocated_Event_Count')){
    $json->{reallocated_sector_count} = $realloc->{raw}->{value};
} elsif (defined $data->{nvme_smart_health_information_log}->{media_errors}){
    # NVMe can report media errors, so report them as reallocated sectors
    $json->{reallocated_sector_count} = $data->{nvme_smart_health_information_log}->{media_errors};
}

if ($offline = get_smart_attr($data, 'Offline_Uncorrectable')){
    $json->{offline_uncorrectable} = $offline->{raw}->{value};
}

# Remaining lifetime: the first available source wins. SSD_Life_Left is read
# from its raw value, the other ATA attributes from their normalized value.
if ($remain = get_smart_attr($data, 'Percent_Lifetime_Remain')){
    $json->{percent_lifetime_remain} = $remain->{value};
} elsif ($remain = get_smart_attr($data, 'SSD_Life_Left')){
    $json->{percent_lifetime_remain} = $remain->{raw}->{value};
} elsif ($remain = get_smart_attr($data, 'Wear_Leveling_Count')){
    $json->{percent_lifetime_remain} = $remain->{value};
} elsif (defined $data->{nvme_smart_health_information_log}->{percentage_used}){
    # NVMe sometimes reports the estimated life used, instead of the remaining
    $json->{percent_lifetime_remain} = 100 - $data->{nvme_smart_health_information_log}->{percentage_used};
}

print_out();

View File

@ -0,0 +1,61 @@
#!/usr/bin/perl
use strict;
use JSON;
use Getopt::Long;
use Data::Dumper;
use File::Which;
# Discovery result: one hash of LLD macros is pushed here per storage device
my $json = [];

# Location of the smartctl binary; checked for definedness before it is used
my $smartctl = which('smartctl');

# --pretty asks for pretty-printed JSON
my $pretty = 0;
GetOptions('pretty' => \$pretty);

# Print the discovery result ($json) as a JSON document on stdout
sub print_out {
    my $encoded = to_json($json, { pretty => $pretty });
    print $encoded;
}
# Run smartctl with the given arguments, followed by --json=c, and return the
# decoded JSON document. Returns undef when smartctl cannot be started or when
# its output is not valid JSON (from_json() dies in that case).
# smartctl is started through a list-form pipe open, so no shell is involved.
sub _smartctl_json {
    my @args = @_;
    my $decoded = eval {
        open(my $pipe, '-|', $smartctl, @args, '--json=c')
            or die "Cannot run $smartctl: $!\n";
        my $output = do { local $/; <$pipe> };
        # The close status is deliberately ignored: smartctl uses its exit
        # status as a bitmask and returns non-zero even with a usable report
        close($pipe);
        from_json($output);
    };
    return $decoded;
}

# No smartctl: answer with an empty discovery list
if (not defined $smartctl){
    print_out();
    exit 0;
}

# No usable scan result: answer with an empty discovery list as well
my $smart_scan = _smartctl_json('--scan-open');
if (ref($smart_scan) ne 'HASH' or ref($smart_scan->{devices}) ne 'ARRAY'){
    print_out();
    exit 0;
}

foreach my $device (@{$smart_scan->{devices}}){
    # Each variable gets its own default; a single scalar on the right-hand
    # side of a list assignment would only initialise the first one
    my ($model, $sn, $has_smart) = ('', '', 0);

    # A device whose details cannot be read is still reported, with the
    # defaults above, instead of aborting the whole discovery
    my $smart_info = _smartctl_json('-i', $device->{name}, '-d', $device->{type});
    if (ref($smart_info) eq 'HASH'){
        $model = $smart_info->{model_name} if defined $smart_info->{model_name};
        $sn = $smart_info->{serial_number} if defined $smart_info->{serial_number};
        # Flagged as SMART capable when known to the smartctl database, or when
        # SMART support is reported as both available and enabled
        $has_smart = (
            $smart_info->{in_smartctl_database} or (
                defined $smart_info->{smart_support} and
                $smart_info->{smart_support}->{available} and
                $smart_info->{smart_support}->{enabled}
            )
        ) ? 1 : 0;
    }

    push @{$json}, {
        '{#STOR_DEV_NAME}'  => $device->{name},
        '{#STOR_DEV_DESC}'  => $device->{info_name},
        '{#STOR_DEV_TYPE}'  => $device->{type},
        '{#STOR_DEV_PROTO}' => $device->{protocol},
        '{#STOR_DEV_MODEL}' => $model,
        '{#STOR_DEV_SN}'    => $sn,
        '{#STOR_DEV_SMART}' => int $has_smart
    };
}

print_out();

View File

@ -0,0 +1,195 @@
zabbix_export:
version: '6.0'
date: '2023-12-21T14:57:39Z'
groups:
- uuid: 7df96b18c230490a9a0a9e2307226338
name: Templates
templates:
- uuid: 98cb8260bbeb4a94a8b07b54608521c8
template: Template_HW_storageDevices
name: Template_HW_storageDevices
groups:
- name: Templates
discovery_rules:
- uuid: 12b9a456943f49aca440958063a5bfe2
name: 'SMART capable Storage devices discovery'
key: 'stor.dev.discovery[smart]'
delay: 1h
filter:
conditions:
- macro: '{#STOR_DEV_SMART}'
value: ^1$
formulaid: A
lifetime: 7d
item_prototypes:
- uuid: 0743506e1c1b49f5b47bd5cef5462c55
name: 'Stor dev {#STOR_DEV_SN} ({#STOR_DEV_DESC}): Pending sectors'
type: DEPENDENT
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},current_pending_sector]'
delay: '0'
history: 30d
value_type: FLOAT
units: '!sector(s)'
preprocessing:
- type: JSONPATH
parameters:
- $.current_pending_sector
master_item:
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE}]'
trigger_prototypes:
- uuid: 9ffa5efc061542e8a5f227936396e5ab
expression: 'change(/Template_HW_storageDevices/stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},current_pending_sector])>0'
recovery_mode: NONE
name: '{ITEM.LASTVALUE1} pending on {#STOR_DEV_SN} ({#STOR_DEV_DESC})'
priority: AVERAGE
manual_close: 'YES'
- uuid: 3da3d8491a024c7ab4f0bb3fc3887a64
name: 'Stor dev {#STOR_DEV_SN} ({#STOR_DEV_DESC}): Firmware version'
type: DEPENDENT
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},firmware_version]'
delay: '0'
history: 30d
trends: '0'
value_type: CHAR
preprocessing:
- type: JSONPATH
parameters:
- $.firmware_version
master_item:
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE}]'
- uuid: 6d4967ca28f549048a507209737fa7bd
name: 'Stor dev {#STOR_DEV_SN} ({#STOR_DEV_DESC}): Offline uncorrectable'
type: DEPENDENT
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},offline_uncorrectable]'
delay: '0'
history: 30d
value_type: FLOAT
units: '!sector(s)'
preprocessing:
- type: JSONPATH
parameters:
- $.offline_uncorrectable
master_item:
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE}]'
trigger_prototypes:
- uuid: 66c5fc3a96354eb3aae8d5a08eea4544
expression: 'change(/Template_HW_storageDevices/stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},offline_uncorrectable])>0'
recovery_mode: NONE
name: '{ITEM.LASTVALUE1} offline uncorrectable on {#STOR_DEV_SN} ({#STOR_DEV_DESC})'
priority: HIGH
manual_close: 'YES'
- uuid: df1ef0de1bc54200895a5d173a583905
name: 'Stor dev {#STOR_DEV_SN} ({#STOR_DEV_DESC}): Percent Lifetime Remain'
type: DEPENDENT
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},percent_lifetime_remain]'
delay: '0'
history: 30d
value_type: FLOAT
units: '%'
preprocessing:
- type: JSONPATH
parameters:
- $.percent_lifetime_remain
master_item:
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE}]'
trigger_prototypes:
- uuid: caaa6496f8e941a08f37c8746bb34bc2
expression: 'last(/Template_HW_storageDevices/stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},percent_lifetime_remain])<{$STOR_LIFETIME_REMAINING_CRITICAL:"{#STOR_DEV_SN}"}'
name: '{ITEM.LASTVALUE1} lifetime remaining for {#STOR_DEV_SN} ({#STOR_DEV_DESC})'
priority: HIGH
manual_close: 'YES'
- uuid: 1df675ca6c4649a081ea315599ce4284
expression: 'last(/Template_HW_storageDevices/stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},percent_lifetime_remain])<{$STOR_LIFETIME_REMAINING_WARN:"{#STOR_DEV_SN}"}'
name: '{ITEM.LASTVALUE1} lifetime remaining for {#STOR_DEV_SN} ({#STOR_DEV_DESC})'
priority: WARNING
manual_close: 'YES'
dependencies:
- name: '{ITEM.LASTVALUE1} lifetime remaining for {#STOR_DEV_SN} ({#STOR_DEV_DESC})'
expression: 'last(/Template_HW_storageDevices/stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},percent_lifetime_remain])<{$STOR_LIFETIME_REMAINING_CRITICAL:"{#STOR_DEV_SN}"}'
- uuid: 70dd4bcd3994423594f7330bbb448038
name: 'Stor dev {#STOR_DEV_SN} ({#STOR_DEV_DESC}): Power Cycle Count'
type: DEPENDENT
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},power_cycle_count]'
delay: '0'
history: 30d
value_type: FLOAT
preprocessing:
- type: JSONPATH
parameters:
- $.power_cycle_count
master_item:
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE}]'
- uuid: f68f569400954636a7e4231c97c57c81
name: 'Stor dev {#STOR_DEV_SN} ({#STOR_DEV_DESC}): Power on hours'
type: DEPENDENT
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},power_on_hours]'
delay: '0'
history: 30d
value_type: FLOAT
units: s
preprocessing:
- type: JSONPATH
parameters:
- $.power_on_hours
- type: MULTIPLIER
parameters:
- '3600'
master_item:
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE}]'
- uuid: 76e724749c304d7da9c38b81bfb9b711
name: 'Stor dev {#STOR_DEV_SN} ({#STOR_DEV_DESC}): Reallocated Sectors'
type: DEPENDENT
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},reallocated_sector_count]'
delay: '0'
history: 30d
value_type: FLOAT
units: '!sector(s)'
preprocessing:
- type: JSONPATH
parameters:
- $.reallocated_sector_count
master_item:
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE}]'
trigger_prototypes:
- uuid: f7b3fa0f6a8347e3b67387da531a2d1a
expression: 'change(/Template_HW_storageDevices/stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},reallocated_sector_count])>0'
recovery_mode: NONE
name: '{ITEM.LASTVALUE1} reallocated on {#STOR_DEV_SN} ({#STOR_DEV_DESC})'
priority: WARNING
manual_close: 'YES'
dependencies:
- name: '{ITEM.LASTVALUE1} reallocated on {#STOR_DEV_SN} ({#STOR_DEV_DESC})'
expression: 'last(/Template_HW_storageDevices/stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},reallocated_sector_count])>{$STOR_REALLOC_SECTOR_WARN:"{#STOR_DEV_SN}"}'
- uuid: 85d6eb29ca7b4175bf8ad15414a44936
expression: 'last(/Template_HW_storageDevices/stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},reallocated_sector_count])>{$STOR_REALLOC_SECTOR_WARN:"{#STOR_DEV_SN}"}'
name: '{ITEM.LASTVALUE1} reallocated on {#STOR_DEV_SN} ({#STOR_DEV_DESC})'
priority: AVERAGE
manual_close: 'YES'
- uuid: 28117d0d998f467ea542821e4a6507c9
name: 'Stor dev {#STOR_DEV_SN} ({#STOR_DEV_DESC}): Temperature'
type: DEPENDENT
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE},temperature_celsius]'
delay: '0'
history: 30d
value_type: FLOAT
units: °C
preprocessing:
- type: JSONPATH
parameters:
- $.temperature_celsius
master_item:
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE}]'
- uuid: 6d3da698590e43f4be62d5daab7f72f0
name: 'Stor dev {#STOR_DEV_SN} ({#STOR_DEV_DESC}) info'
key: 'stor.dev.info[{#STOR_DEV_NAME},{#STOR_DEV_TYPE}]'
delay: 8m
history: '0'
trends: '0'
value_type: TEXT
# Default thresholds used by the trigger prototypes. The trigger expressions
# reference them with the device serial number as macro context.
macros:
# Remaining lifetime (%) below which the HIGH priority trigger fires
- macro: '{$STOR_LIFETIME_REMAINING_CRITICAL}'
value: '6'
# Remaining lifetime (%) below which the WARNING priority trigger fires
- macro: '{$STOR_LIFETIME_REMAINING_WARN}'
value: '10'
# Reallocated sector count above which the AVERAGE priority trigger fires
- macro: '{$STOR_REALLOC_SECTOR_WARN}'
value: '40'