~monitoring-plugins~ package. Thus a NAGIOS4 server on the abbey's
Core monitors core network services, and uses ~nagios-nrpe-server~ to
monitor Gate. The abbey adds several more monitors, installing
-additional configuration files in =/etc/nagios4/conf.d/=, and another
-customized ~check_sensors~ plugin (~abbey_pisensors~) in
-=/usr/local/sbin/= on the Raspberry Pis.
+additional configuration files in =/etc/nagios4/conf.d/=, a
+~check_mdstat~ plugin from ~https://exchange.nagios.org/~ on Core, and
+another customized ~check_sensors~ plugin (~abbey_pisensors~) on the
+Raspberry Pis.
*** Monitoring The Home Disk
The abbey adds monitoring of the space remaining on the volume at
=/home/= on Core. (The small institute only monitors the space
-remaining on roots.)
+remaining on roots.) The abbey also monitors the state of the
+RAID-5 array under =/home/=.
#+CAPTION: [[file:roles_t/abbey-core/tasks/main.yml][=roles_t/abbey-core/tasks/main.yml=]]
#+BEGIN_SRC conf :tangle roles_t/abbey-core/tasks/main.yml
service_description Home Partition
check_command check_local_disk!20%!10%!/home
}
+ define service {
+ use local-service
+ host_name core
+ service_description Home RAID
+ check_command check_mdstat!md0!3
+ }
+ define command {
+ command_name check_mdstat
+ command_line /usr/local/sbin/check_mdstat $ARG1$ $ARG2$
+ }
dest: /etc/nagios4/conf.d/abbey.cfg
notify: Reload NAGIOS4.
+
+# Copy the check_mdstat plugin to /usr/local/sbin/check_mdstat, owner-writable and executable by everyone.
+- name: Install NAGIOS monitor check_mdstat.
+ become: yes
+ copy:
+ src: ../abbey-core/files/check_mdstat
+ dest: /usr/local/sbin/check_mdstat
+ mode: u=rwx,g=rx,o=rx
#+END_SRC
#+CAPTION: [[file:roles_t/abbey-core/handlers/main.yml][=roles_t/abbey-core/handlers/main.yml=]]
esac
#+END_SRC
+*** Stolen NAGIOS Monitor ~check_mdstat~
+
+This ~check_mdstat~ plugin was copied from the NAGIOS Exchange ([[https://exchange.nagios.org/directory/plugins/operating-systems/linux/check_mdstat/details/][here]]).
+It detects a failing disk in a multi-disk array.
+
+#+CAPTION: [[file:roles_t/abbey-core/files/check_mdstat][=roles_t/abbey-core/files/check_mdstat=]]
+#+BEGIN_SRC sh :tangle roles_t/abbey-core/files/check_mdstat
+#!/usr/bin/env bash
+
+# nagios script checks for failed raid device
+# linux software raid /proc/mdstat
+# karl@webmedianow.com 2013-10-01
+
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+STATE_DEPENDENT=4
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+usage() {
+cat <<-EOE
+Usage: $0 mdadm_device total_drives
+
+ mdadm_device is md0, md1, etc...
+ total_drives is 2 for mirror, or 3, 4 etc...
+
+Nagios script to check if failed drive in /proc/mdstat
+
+Example: raid 2 (2 disk mirror)
+ /opt/nagios/libexec/check_mdstat.sh md0 2
+
+Example: raid 5 with 8 disks
+ /opt/nagios/libexec/check_mdstat.sh md0 8
+
+EOE
+exit $STATE_UNKNOWN
+}
+
+if [ $# -lt 2 ]; then
+ usage
+fi
+
+cmd_device="$1"
+drive_num="$2"
+
+U=""
+for i in $(seq 1 $drive_num);
+do
+ U="${U}U"
+done
+
+uu="[${U}]"
+nn="[${drive_num}/${drive_num}]"
+
+#cat /proc/mdstat | grep -A 1 ^md1 | tail -1 | awk '{print ($(NF))}'
+# [UUUUUUUU] is OK raid
+# [_U] is Failed Drive
+
+# check if we have correct device...
+if cat /proc/mdstat | grep ^${cmd_device} | awk '{print $1}' | grep ^${cmd_device}$ >/dev/null 2>&1
+then
+ device=$cmd_device
+else
+ echo "Couldn't match $cmd_device"
+ exit $STATE_UNKNOWN
+fi
+
+u_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF))}')
+n_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF-1))}')
+
+if [ $uu = $u_status ] && [ $nn = $n_status ]; then
+ echo "OK: $device $n_status $u_status"
+ exit $STATE_OK
+else
+ echo "FAIL: $device $n_status $u_status"
+ exit $STATE_CRITICAL
+fi
+
+
+#+END_SRC
+
*** Configure NAGIOS Monitoring of The Cloister
The abbey adds monitoring for more servers: Dantooine and Kessel.
--- /dev/null
+#!/usr/bin/env bash
+
+# nagios script checks for failed raid device
+# linux software raid /proc/mdstat
+# karl@webmedianow.com 2013-10-01
+
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+STATE_DEPENDENT=4
+
+PATH=/bin:/usr/bin:/sbin:/usr/sbin
+export PATH
+
+usage() {
+cat <<-EOE
+Usage: $0 mdadm_device total_drives
+
+ mdadm_device is md0, md1, etc...
+ total_drives is 2 for mirror, or 3, 4 etc...
+
+Nagios script to check if failed drive in /proc/mdstat
+
+Example: raid 2 (2 disk mirror)
+ /opt/nagios/libexec/check_mdstat.sh md0 2
+
+Example: raid 5 with 8 disks
+ /opt/nagios/libexec/check_mdstat.sh md0 8
+
+EOE
+exit $STATE_UNKNOWN
+}
+
+if [ $# -lt 2 ]; then
+ usage
+fi
+
+cmd_device="$1"
+drive_num="$2"
+
+U=""
+for i in $(seq 1 $drive_num);
+do
+ U="${U}U"
+done
+
+uu="[${U}]"
+nn="[${drive_num}/${drive_num}]"
+
+#cat /proc/mdstat | grep -A 1 ^md1 | tail -1 | awk '{print ($(NF))}'
+# [UUUUUUUU] is OK raid
+# [_U] is Failed Drive
+
+# check if we have correct device...
+if cat /proc/mdstat | grep ^${cmd_device} | awk '{print $1}' | grep ^${cmd_device}$ >/dev/null 2>&1
+then
+ device=$cmd_device
+else
+ echo "Couldn't match $cmd_device"
+ exit $STATE_UNKNOWN
+fi
+
+u_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF))}')
+n_status=$(cat /proc/mdstat | grep -A 1 ^${device} | tail -1 | awk '{print ($(NF-1))}')
+
+if [ $uu = $u_status ] && [ $nn = $n_status ]; then
+ echo "OK: $device $n_status $u_status"
+ exit $STATE_OK
+else
+ echo "FAIL: $device $n_status $u_status"
+ exit $STATE_CRITICAL
+fi