Add mdstat monitoring to NAGIOS, to monitor Home on Core.

author Matt Birkholz <matt@birchwood-abbey.net>

Sun, 23 Nov 2025 19:31:46 +0000 (12:31 -0700)

committer Matt Birkholz <matt@birchwood-abbey.net>

Sun, 23 Nov 2025 19:36:10 +0000 (12:36 -0700)
author Matt Birkholz <matt@birchwood-abbey.net>
Sun, 23 Nov 2025 19:31:46 +0000 (12:31 -0700)
committer Matt Birkholz <matt@birchwood-abbey.net>
Sun, 23 Nov 2025 19:36:10 +0000 (12:36 -0700)
diff --git a/README.org b/README.org

index f0f8c3d97c1c724eaa738552550022cf6d2acd44..2e426b6edb9936ad8dc0e624ce9ebd6582638a82 100644 (file)
--- a/README.org
+++ b/README.org
@@ -1015,15 +1015,17 @@ with an initial smattering of monitors adopted from the Debian
  ~monitoring-plugins~ package.  Thus a NAGIOS4 server on the abbey's
  Core monitors core network services, and uses ~nagios-nrpe-server~ to
  monitor Gate.  The abbey adds several more monitors, installing
-additional configuration files in =/etc/nagios4/conf.d/=, and another
-customized ~check_sensors~ plugin (~abbey_pisensors~) in
-=/usr/local/sbin/= on the Raspberry Pis.
+additional configuration files in =/etc/nagios4/conf.d/=, a
+~check_mdstat~ plugin from ~https://exchange.nagios.org/~ on Core, and
+another customized ~check_sensors~ plugin (~abbey_pisensors~) on the
+Raspberry Pis.
  
  *** Monitoring The Home Disk
  
  The abbey adds monitoring of the space remaining on the volume at
  =/home/= on Core.  (The small institute only monitors the space
-remaining on roots.)
+remaining on roots.)  The abbey also monitors the state of the
+RAID-5 array under =/home/=.
  
  #+CAPTION: [[file:roles_t/abbey-core/tasks/main.yml][=roles_t/abbey-core/tasks/main.yml=]]
  #+BEGIN_SRC conf :tangle roles_t/abbey-core/tasks/main.yml
@@ -1038,8 +1040,25 @@ remaining on roots.)
            service_description     Home Partition
            check_command           check_local_disk!20%!10%!/home
        }
+      define service {
+          use                     local-service
+          host_name               core
+          service_description     Home RAID
+          check_command           check_mdstat!md0!3
+      }
+      define command {
+          command_name            check_mdstat
+          command_line            /usr/local/sbin/check_mdstat $ARG1$ $ARG2$
+      }
      dest: /etc/nagios4/conf.d/abbey.cfg
    notify: Reload NAGIOS4.
+
+- name: Install NAGIOS monitor check_mdstat.
+  become: yes
+  copy:
+    src: ../abbey-core/files/check_mdstat
+    dest: /usr/local/sbin/check_mdstat
+    mode: u=rwx,g=rx,o=rx
  #+END_SRC
  
  #+CAPTION: [[file:roles_t/abbey-core/handlers/main.yml][=roles_t/abbey-core/handlers/main.yml=]]
@@ -1145,6 +1164,90 @@ case "$1" in
  esac
  #+END_SRC
  
+*** Stolen NAGIOS Monitor ~check_mdstat~
+
+This ~check_mdstat~ plugin was copied from the NAGIOS Exchange ([[https://exchange.nagios.org/directory/plugins/operating-systems/linux/check_mdstat/details/][here]]).
+It detects a failing disk in a multi-disk array.
+
+#+CAPTION: [[file:roles_t/abbey-core/files/check_mdstat][=roles_t/abbey-core/files/check_mdstat=]]
+#+BEGIN_SRC sh :tangle roles_t/abbey-core/files/check_mdstat
+#!/usr/bin/env bash
+
+# nagios script checks for failed raid device
+# linux software raid /proc/mdstat
+# karl@webmedianow.com 2013-10-01
+
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+STATE_DEPENDENT=4
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+usage() {
+cat <<-EOE
+Usage: $0 mdadm_device total_drives
+
+  mdadm_device is md0, md1, etc...
+  total_drives is 2 for mirror, or 3, 4 etc...
+
+Nagios script to check if failed drive in /proc/mdstat
+
+Example: raid 2 (2 disk mirror)
+  /opt/nagios/libexec/check_mdstat.sh md0 2
+
+Example: raid 5 with 8 disks
+  /opt/nagios/libexec/check_mdstat.sh md0 8
+
+EOE
+exit $STATE_UNKNOWN
+}
+
+if [ $# -lt 2 ]; then
+  usage
+fi
+
+cmd_device="$1"
+drive_num="$2"
+
+U=""
+for i in $(seq 1 $drive_num);
+do
+  U="${U}U"
+done
+
+uu="[${U}]"
+nn="[${drive_num}/${drive_num}]"
+
+#cat /proc/mdstat | grep -A 1 ^md1 | tail -1 | awk '{print ($(NF))}'
+# [UUUUUUUU] is OK raid
+# [_U] is Failed Drive
+
+# check if we have correct device...
+if cat /proc/mdstat | grep ^${cmd_device} | awk '{print $1}' | grep ^${cmd_device}$ >/dev/null 2>&1
+then
+  device=$cmd_device
+else
+  echo "Couldn't match $cmd_device"
+  exit $STATE_UNKNOWN 
+fi
+
+u_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF))}')
+n_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF-1))}')
+
+if [ $uu = $u_status ] && [ $nn = $n_status ]; then
+  echo "OK:  $device $n_status $u_status"
+  exit $STATE_OK
+else
+  echo "FAIL:  $device $n_status $u_status"
+  exit $STATE_CRITICAL
+fi
+
+
+#+END_SRC
+
  *** Configure NAGIOS Monitoring of The Cloister
  
  The abbey adds monitoring for more servers: Dantooine and Kessel.
diff --git a/roles_t/abbey-core/files/check_mdstat b/roles_t/abbey-core/files/check_mdstat

new file mode 100644 (file)

index 0000000..60a9fe6
--- /dev/null
+++ b/roles_t/abbey-core/files/check_mdstat
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+# nagios script checks for failed raid device
+# linux software raid /proc/mdstat
+# karl@webmedianow.com 2013-10-01
+
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+STATE_DEPENDENT=4
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+usage() {
+cat <<-EOE
+Usage: $0 mdadm_device total_drives
+
+  mdadm_device is md0, md1, etc...
+  total_drives is 2 for mirror, or 3, 4 etc...
+
+Nagios script to check if failed drive in /proc/mdstat
+
+Example: raid 2 (2 disk mirror)
+  /opt/nagios/libexec/check_mdstat.sh md0 2
+
+Example: raid 5 with 8 disks
+  /opt/nagios/libexec/check_mdstat.sh md0 8
+
+EOE
+exit $STATE_UNKNOWN
+}
+
+if [ $# -lt 2 ]; then
+  usage
+fi
+
+cmd_device="$1"
+drive_num="$2"
+
+U=""
+for i in $(seq 1 $drive_num);
+do
+  U="${U}U"
+done
+
+uu="[${U}]"
+nn="[${drive_num}/${drive_num}]"
+
+#cat /proc/mdstat | grep -A 1 ^md1 | tail -1 | awk '{print ($(NF))}'
+# [UUUUUUUU] is OK raid
+# [_U] is Failed Drive
+
+# check if we have correct device...
+if cat /proc/mdstat | grep ^${cmd_device} | awk '{print $1}' | grep ^${cmd_device}$ >/dev/null 2>&1
+then
+  device=$cmd_device
+else
+  echo "Couldn't match $cmd_device"
+  exit $STATE_UNKNOWN 
+fi
+
+u_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF))}')
+n_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF-1))}')
+
+if [ $uu = $u_status ] && [ $nn = $n_status ]; then
+  echo "OK:  $device $n_status $u_status"
+  exit $STATE_OK
+else
+  echo "FAIL:  $device $n_status $u_status"
+  exit $STATE_CRITICAL
+fi
diff --git a/roles_t/abbey-core/tasks/main.yml b/roles_t/abbey-core/tasks/main.yml

index d60f27d68526cdffcf9e068442b4c4e049d090fc..5b602fb2e742bf446dfc838ad27a221c13c6496c 100644 (file)
--- a/roles_t/abbey-core/tasks/main.yml
+++ b/roles_t/abbey-core/tasks/main.yml
@@ -199,9 +199,26 @@
            service_description     Home Partition
            check_command           check_local_disk!20%!10%!/home
        }
+      define service {
+          use                     local-service
+          host_name               core
+          service_description     Home RAID
+          check_command           check_mdstat!md0!3
+      }
+      define command {
+          command_name            check_mdstat
+          command_line            /usr/local/sbin/check_mdstat $ARG1$ $ARG2$
+      }
      dest: /etc/nagios4/conf.d/abbey.cfg
    notify: Reload NAGIOS4.
  
+- name: Install NAGIOS monitor check_mdstat.
+  become: yes
+  copy:
+    src: ../abbey-core/files/check_mdstat
+    dest: /usr/local/sbin/check_mdstat
+    mode: u=rwx,g=rx,o=rx
+
  - name: Configure cloister NAGIOS monitoring.
    become: yes
    template:
author	Matt Birkholz <matt@birchwood-abbey.net>
	Sun, 23 Nov 2025 19:31:46 +0000 (12:31 -0700)
committer	Matt Birkholz <matt@birchwood-abbey.net>
	Sun, 23 Nov 2025 19:36:10 +0000 (12:36 -0700)
README.org		patch \| blob \| history
roles_t/abbey-core/files/check_mdstat	[new file with mode: 0644]	patch \| blob
roles_t/abbey-core/tasks/main.yml		patch \| blob \| history