From df7a8b341c7d1c2b75dc22dc58ce0aee380cd682 Mon Sep 17 00:00:00 2001 From: Matt Birkholz Date: Sun, 23 Nov 2025 12:31:46 -0700 Subject: [PATCH] Add mdstat monitoring to NAGIOS, to monitor Home on Core. --- README.org | 111 +++++++++++++++++++++++++- roles_t/abbey-core/files/check_mdstat | 73 +++++++++++++++++ roles_t/abbey-core/tasks/main.yml | 17 ++++ 3 files changed, 197 insertions(+), 4 deletions(-) create mode 100644 roles_t/abbey-core/files/check_mdstat diff --git a/README.org b/README.org index f0f8c3d..2e426b6 100644 --- a/README.org +++ b/README.org @@ -1015,15 +1015,17 @@ with an initial smattering of monitors adopted from the Debian ~monitoring-plugins~ package. Thus a NAGIOS4 server on the abbey's Core monitors core network services, and uses ~nagios-nrpe-server~ to monitor Gate. The abbey adds several more monitors, installing -additional configuration files in =/etc/nagios4/conf.d/=, and another -customized ~check_sensors~ plugin (~abbey_pisensors~) in -=/usr/local/sbin/= on the Raspberry Pis. +additional configuration files in =/etc/nagios4/conf.d/=, a +~check_mdstat~ plugin from ~https://exchange.nagios.org/~ on Core, and +another customized ~check_sensors~ plugin (~abbey_pisensors~) on the +Raspberry Pis. *** Monitoring The Home Disk The abbey adds monitoring of the space remaining on the volume at =/home/= on Core. (The small institute only monitors the space -remaining on roots.) +remaining on roots.) The abbey also monitors the state of the +RAID-5 array under =/home/=. #+CAPTION: [[file:roles_t/abbey-core/tasks/main.yml][=roles_t/abbey-core/tasks/main.yml=]] #+BEGIN_SRC conf :tangle roles_t/abbey-core/tasks/main.yml @@ -1038,8 +1040,25 @@ remaining on roots.) 
service_description Home Partition check_command check_local_disk!20%!10%!/home } + define service { + use local-service + host_name core + service_description Home RAID + check_command check_mdstat!md0!3 + } + define command { + command_name check_mdstat + command_line /usr/local/sbin/check_mdstat $ARG1$ $ARG2$ + } dest: /etc/nagios4/conf.d/abbey.cfg notify: Reload NAGIOS4. + +- name: Install NAGIOS monitor check_mdstat. + become: yes + copy: + src: ../abbey-core/files/check_mdstat + dest: /usr/local/sbin/check_mdstat + mode: u=rwx,g=rx,o=rx #+END_SRC #+CAPTION: [[file:roles_t/abbey-core/handlers/main.yml][=roles_t/abbey-core/handlers/main.yml=]] @@ -1145,6 +1164,90 @@ case "$1" in esac #+END_SRC +*** Stolen NAGIOS Monitor ~check_mdstat~ + +This ~check_mdstat~ plugin was copied from the NAGIOS Exchange ([[https://exchange.nagios.org/directory/plugins/operating-systems/linux/check_mdstat/details/][here]]). +It detects a failing disk in a multi-disk array. + +#+CAPTION: [[file:roles_t/abbey-core/files/check_mdstat][=roles_t/abbey-core/files/check_mdstat=]] +#+BEGIN_SRC sh :tangle roles_t/abbey-core/files/check_mdstat +#!/usr/bin/env bash + +# nagios script checks for failed raid device +# linux software raid /proc/mdstat +# karl@webmedianow.com 2013-10-01 + +STATE_OK=0 +STATE_WARNING=1 +STATE_CRITICAL=2 +STATE_UNKNOWN=3 +STATE_DEPENDENT=4 + +PATH=/bin:/usr/bin:/sbin:/usr/sbin +export PATH + +usage() { +cat <<-EOE +Usage: $0 mdadm_device total_drives + + mdadm_device is md0, md1, etc... + total_drives is 2 for mirror, or 3, 4 etc... 
+ +Nagios script to check if failed drive in /proc/mdstat + +Example: raid 2 (2 disk mirror) + /opt/nagios/libexec/check_mdstat.sh md0 2 + +Example: raid 5 with 8 disks + /opt/nagios/libexec/check_mdstat.sh md0 8 + +EOE +exit $STATE_UNKNOWN +} + +if [ $# -lt 2 ]; then + usage +fi + +cmd_device="$1" +drive_num="$2" + +U="" +for i in $(seq 1 $drive_num); +do + U="${U}U" +done + +uu="[${U}]" +nn="[${drive_num}/${drive_num}]" + +#cat /proc/mdstat | grep -A 1 ^md1 | tail -1 | awk '{print ($(NF))}' +# [UUUUUUUU] is OK raid +# [_U] is Failed Drive + +# check if we have correct device... +if cat /proc/mdstat | grep ^${cmd_device} | awk '{print $1}' | grep ^${cmd_device}$ >/dev/null 2>&1 +then + device=$cmd_device +else + echo "Couldn't match $cmd_device" + exit $STATE_UNKNOWN +fi + +u_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF))}') +n_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF-1))}') + +if [ $uu = $u_status ] && [ $nn = $n_status ]; then + echo "OK: $device $n_status $u_status" + exit $STATE_OK +else + echo "FAIL: $device $n_status $u_status" + exit $STATE_CRITICAL +fi + + +#+END_SRC + *** Configure NAGIOS Monitoring of The Cloister The abbey adds monitoring for more servers: Dantooine and Kessel. diff --git a/roles_t/abbey-core/files/check_mdstat b/roles_t/abbey-core/files/check_mdstat new file mode 100644 index 0000000..60a9fe6 --- /dev/null +++ b/roles_t/abbey-core/files/check_mdstat @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +# nagios script checks for failed raid device +# linux software raid /proc/mdstat +# karl@webmedianow.com 2013-10-01 + +STATE_OK=0 +STATE_WARNING=1 +STATE_CRITICAL=2 +STATE_UNKNOWN=3 +STATE_DEPENDENT=4 + +PATH=/bin:/usr/bin:/sbin:/usr/sbin +export PATH + +usage() { +cat <<-EOE +Usage: $0 mdadm_device total_drives + + mdadm_device is md0, md1, etc... + total_drives is 2 for mirror, or 3, 4 etc... 
+ +Nagios script to check if failed drive in /proc/mdstat + +Example: raid 2 (2 disk mirror) + /opt/nagios/libexec/check_mdstat.sh md0 2 + +Example: raid 5 with 8 disks + /opt/nagios/libexec/check_mdstat.sh md0 8 + +EOE +exit $STATE_UNKNOWN +} + +if [ $# -lt 2 ]; then + usage +fi + +cmd_device="$1" +drive_num="$2" + +U="" +for i in $(seq 1 $drive_num); +do + U="${U}U" +done + +uu="[${U}]" +nn="[${drive_num}/${drive_num}]" + +#cat /proc/mdstat | grep -A 1 ^md1 | tail -1 | awk '{print ($(NF))}' +# [UUUUUUUU] is OK raid +# [_U] is Failed Drive + +# check if we have correct device... +if cat /proc/mdstat | grep ^${cmd_device} | awk '{print $1}' | grep ^${cmd_device}$ >/dev/null 2>&1 +then + device=$cmd_device +else + echo "Couldn't match $cmd_device" + exit $STATE_UNKNOWN +fi + +u_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF))}') +n_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF-1))}') + +if [ $uu = $u_status ] && [ $nn = $n_status ]; then + echo "OK: $device $n_status $u_status" + exit $STATE_OK +else + echo "FAIL: $device $n_status $u_status" + exit $STATE_CRITICAL +fi diff --git a/roles_t/abbey-core/tasks/main.yml b/roles_t/abbey-core/tasks/main.yml index d60f27d..5b602fb 100644 --- a/roles_t/abbey-core/tasks/main.yml +++ b/roles_t/abbey-core/tasks/main.yml @@ -199,9 +199,26 @@ service_description Home Partition check_command check_local_disk!20%!10%!/home } + define service { + use local-service + host_name core + service_description Home RAID + check_command check_mdstat!md0!3 + } + define command { + command_name check_mdstat + command_line /usr/local/sbin/check_mdstat $ARG1$ $ARG2$ + } dest: /etc/nagios4/conf.d/abbey.cfg notify: Reload NAGIOS4. +- name: Install NAGIOS monitor check_mdstat. + become: yes + copy: + src: ../abbey-core/files/check_mdstat + dest: /usr/local/sbin/check_mdstat + mode: u=rwx,g=rx,o=rx + - name: Configure cloister NAGIOS monitoring. become: yes template: -- 2.25.1