From df7a8b341c7d1c2b75dc22dc58ce0aee380cd682 Mon Sep 17 00:00:00 2001
From: Matt Birkholz <matt@birchwood-abbey.net>
Date: Sun, 23 Nov 2025 12:31:46 -0700
Subject: [PATCH] Add mdstat monitoring to NAGIOS, to monitor Home on Core.

---
 README.org                            | 111 +++++++++++++++++++++++++-
 roles_t/abbey-core/files/check_mdstat |  73 +++++++++++++++++
 roles_t/abbey-core/tasks/main.yml     |  17 ++++
 3 files changed, 197 insertions(+), 4 deletions(-)
 create mode 100644 roles_t/abbey-core/files/check_mdstat

diff --git a/README.org b/README.org
index f0f8c3d..2e426b6 100644
--- a/README.org
+++ b/README.org
@@ -1015,15 +1015,17 @@ with an initial smattering of monitors adopted from the Debian
 ~monitoring-plugins~ package.  Thus a NAGIOS4 server on the abbey's
 Core monitors core network services, and uses ~nagios-nrpe-server~ to
 monitor Gate.  The abbey adds several more monitors, installing
-additional configuration files in =/etc/nagios4/conf.d/=, and another
-customized ~check_sensors~ plugin (~abbey_pisensors~) in
-=/usr/local/sbin/= on the Raspberry Pis.
+additional configuration files in =/etc/nagios4/conf.d/=, a
+~check_mdstat~ plugin from ~https://exchange.nagios.org/~ on Core, and
+another customized ~check_sensors~ plugin (~abbey_pisensors~) on the
+Raspberry Pis.
 
 *** Monitoring The Home Disk
 
 The abbey adds monitoring of the space remaining on the volume at
 =/home/= on Core.  (The small institute only monitors the space
-remaining on roots.)
+remaining on roots.)  The abbey also monitors the state of the
+RAID-5 array under =/home/=.
 
 #+CAPTION: [[file:roles_t/abbey-core/tasks/main.yml][=roles_t/abbey-core/tasks/main.yml=]]
 #+BEGIN_SRC conf :tangle roles_t/abbey-core/tasks/main.yml
@@ -1038,8 +1040,25 @@ remaining on roots.)
           service_description     Home Partition
           check_command           check_local_disk!20%!10%!/home
       }
+      define service {
+          use                     local-service
+          host_name               core
+          service_description     Home RAID
+          check_command           check_mdstat!md0!3
+      }
+      define command {
+          command_name            check_mdstat
+          command_line            /usr/local/sbin/check_mdstat $ARG1$ $ARG2$
+      }
     dest: /etc/nagios4/conf.d/abbey.cfg
   notify: Reload NAGIOS4.
+
+- name: Install NAGIOS monitor check_mdstat.
+  become: yes
+  copy:
+    src: ../abbey-core/files/check_mdstat
+    dest: /usr/local/sbin/check_mdstat
+    mode: u=rwx,g=rx,o=rx
 #+END_SRC
 
 #+CAPTION: [[file:roles_t/abbey-core/handlers/main.yml][=roles_t/abbey-core/handlers/main.yml=]]
@@ -1145,6 +1164,90 @@ case "$1" in
 esac
 #+END_SRC
 
+*** Stolen NAGIOS Monitor ~check_mdstat~
+
+This ~check_mdstat~ plugin was copied from the NAGIOS Exchange ([[https://exchange.nagios.org/directory/plugins/operating-systems/linux/check_mdstat/details/][here]]).
+It reads =/proc/mdstat= and reports an array with a failed or missing disk.
+
+#+CAPTION: [[file:roles_t/abbey-core/files/check_mdstat][=roles_t/abbey-core/files/check_mdstat=]]
+#+BEGIN_SRC sh :tangle roles_t/abbey-core/files/check_mdstat
+#!/usr/bin/env bash
+
+# NAGIOS plugin: report a degraded Linux software-RAID (md) array.
+# Reads /proc/mdstat; originally by karl@webmedianow.com 2013-10-01.
+# Usage: check_mdstat mdadm_device total_drives   (e.g. md0 3)
+
+# Standard NAGIOS plugin exit codes.
+readonly STATE_OK=0
+readonly STATE_WARNING=1
+readonly STATE_CRITICAL=2
+readonly STATE_UNKNOWN=3
+readonly STATE_DEPENDENT=4
+
+# Use a fixed PATH so awk is found whatever the caller's environment.
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+# Print usage on stdout and exit UNKNOWN.
+usage() {
+cat <<-EOE
+Usage: $0 mdadm_device total_drives
+
+  mdadm_device is md0, md1, etc...
+  total_drives is 2 for mirror, or 3, 4 etc...
+
+Nagios script to check if failed drive in /proc/mdstat
+
+Example: raid 1 (2 disk mirror)
+  /opt/nagios/libexec/check_mdstat.sh md0 2
+
+Example: raid 5 with 8 disks
+  /opt/nagios/libexec/check_mdstat.sh md0 8
+
+EOE
+exit $STATE_UNKNOWN
+}
+
+# Both arguments are required; total_drives must be a positive integer.
+if [[ $# -lt 2 || ! "$2" =~ ^[1-9][0-9]*$ ]]; then
+  usage
+fi
+
+cmd_device="$1"
+drive_num="$2"
+
+# A healthy array reports e.g. "[3/3] [UUU]"; a failed member shows as
+# "_" (e.g. "[2/1] [_U]").  Build the healthy strings for drive_num.
+printf -v U '%*s' "$drive_num" ''
+uu="[${U// /U}]"
+nn="[${drive_num}/${drive_num}]"
+
+# Find the line whose first field is exactly the device (so md1 does not
+# also match md10) and print the last two fields of the line after it.
+# awk exits non-zero when the device is absent or mdstat is unreadable.
+if ! status=$(awk -v dev="$cmd_device" '
+    $1 == dev { found = 1
+                if ((getline) > 0 && NF >= 2) print $(NF-1), $NF
+                exit }
+    END { exit !found }' /proc/mdstat 2>/dev/null)
+then
+  echo "Couldn't match $cmd_device"
+  exit $STATE_UNKNOWN
+fi
+
+read -r n_status u_status <<<"$status"
+
+if [[ "$u_status" == "$uu" && "$n_status" == "$nn" ]]; then
+  echo "OK:  $cmd_device $n_status $u_status"
+  exit $STATE_OK
+else
+  echo "FAIL:  $cmd_device $n_status $u_status"
+  exit $STATE_CRITICAL
+fi
+
+
+#+END_SRC
+
 *** Configure NAGIOS Monitoring of The Cloister
 
 The abbey adds monitoring for more servers: Dantooine and Kessel.
diff --git a/roles_t/abbey-core/files/check_mdstat b/roles_t/abbey-core/files/check_mdstat
new file mode 100644
index 0000000..60a9fe6
--- /dev/null
+++ b/roles_t/abbey-core/files/check_mdstat
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+# NAGIOS plugin: report a degraded Linux software-RAID (md) array.
+# Reads /proc/mdstat; originally by karl@webmedianow.com 2013-10-01.
+# Usage: check_mdstat mdadm_device total_drives   (e.g. md0 3)
+
+# Standard NAGIOS plugin exit codes.
+readonly STATE_OK=0
+readonly STATE_WARNING=1
+readonly STATE_CRITICAL=2
+readonly STATE_UNKNOWN=3
+readonly STATE_DEPENDENT=4
+
+# Use a fixed PATH so awk is found whatever the caller's environment.
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+# Print usage on stdout and exit UNKNOWN.
+usage() {
+cat <<-EOE
+Usage: $0 mdadm_device total_drives
+
+  mdadm_device is md0, md1, etc...
+  total_drives is 2 for mirror, or 3, 4 etc...
+
+Nagios script to check if failed drive in /proc/mdstat
+
+Example: raid 1 (2 disk mirror)
+  /opt/nagios/libexec/check_mdstat.sh md0 2
+
+Example: raid 5 with 8 disks
+  /opt/nagios/libexec/check_mdstat.sh md0 8
+
+EOE
+exit $STATE_UNKNOWN
+}
+
+# Both arguments are required; total_drives must be a positive integer.
+if [[ $# -lt 2 || ! "$2" =~ ^[1-9][0-9]*$ ]]; then
+  usage
+fi
+
+cmd_device="$1"
+drive_num="$2"
+
+# A healthy array reports e.g. "[3/3] [UUU]"; a failed member shows as
+# "_" (e.g. "[2/1] [_U]").  Build the healthy strings for drive_num.
+printf -v U '%*s' "$drive_num" ''
+uu="[${U// /U}]"
+nn="[${drive_num}/${drive_num}]"
+
+# Find the line whose first field is exactly the device (so md1 does not
+# also match md10) and print the last two fields of the line after it.
+# awk exits non-zero when the device is absent or mdstat is unreadable.
+if ! status=$(awk -v dev="$cmd_device" '
+    $1 == dev { found = 1
+                if ((getline) > 0 && NF >= 2) print $(NF-1), $NF
+                exit }
+    END { exit !found }' /proc/mdstat 2>/dev/null)
+then
+  echo "Couldn't match $cmd_device"
+  exit $STATE_UNKNOWN
+fi
+
+read -r n_status u_status <<<"$status"
+
+if [[ "$u_status" == "$uu" && "$n_status" == "$nn" ]]; then
+  echo "OK:  $cmd_device $n_status $u_status"
+  exit $STATE_OK
+else
+  echo "FAIL:  $cmd_device $n_status $u_status"
+  exit $STATE_CRITICAL
+fi
diff --git a/roles_t/abbey-core/tasks/main.yml b/roles_t/abbey-core/tasks/main.yml
index d60f27d..5b602fb 100644
--- a/roles_t/abbey-core/tasks/main.yml
+++ b/roles_t/abbey-core/tasks/main.yml
@@ -199,9 +199,26 @@
           service_description     Home Partition
           check_command           check_local_disk!20%!10%!/home
       }
+      define service {
+          use                     local-service
+          host_name               core
+          service_description     Home RAID
+          check_command           check_mdstat!md0!3
+      }
+      define command {
+          command_name            check_mdstat
+          command_line            /usr/local/sbin/check_mdstat $ARG1$ $ARG2$
+      }
     dest: /etc/nagios4/conf.d/abbey.cfg
   notify: Reload NAGIOS4.
 
+- name: Install NAGIOS monitor check_mdstat.
+  become: yes
+  copy:
+    src: ../abbey-core/files/check_mdstat
+    dest: /usr/local/sbin/check_mdstat
+    mode: u=rwx,g=rx,o=rx
+
 - name: Configure cloister NAGIOS monitoring.
   become: yes
   template:
-- 
2.47.3