diff --git a/inventories/production/group_vars/all.yml b/inventories/production/group_vars/all.yml index 86f3198..2d7f41b 100644 --- a/inventories/production/group_vars/all.yml +++ b/inventories/production/group_vars/all.yml @@ -166,6 +166,149 @@ secret_nagios_matrix_token: !vault | 6433376138386531380a383762393137613738643538343438633730313135613730613139393536 35666133666262383862663637623738643836383633653864626231623034613662646563623936 3763356331333561383833386162616664376335333139376363 +nagios_contacts: + - name: matrix + host_notification_commands: notify-host-by-matrix + service_notification_commands: notify-service-by-matrix + host_notification_period: ansible-not-late-at-night + service_notification_period: ansible-not-late-at-night + extra: + - key: contactgroups + value: ansible + - name: salt + host_notification_commands: notify-host-by-email + service_notification_commands: notify-service-by-email + extra: + - key: email + value: alerts@babor.tech +nagios_commands: + # This command is included in the container image + - name: check_nrpe + command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$" + - name: check_by_ssh + command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\"" + - name: notify-host-by-matrix + command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\n$HOSTNAME$ is $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix" + - name: notify-service-by-matrix + command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\nService $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix" +nagios_services: + # Agentless checks + - name: HTTP + command: check_http + hostgroup: tag-nagios-checkhttp + - name: HTTPS + command: check_http!--ssl + hostgroup: tag-nagios-checkhttp + - name: SSH + 
command: check_ssh + # check_by_ssh checks + - name: CPU Utilization + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90 + - name: DNS Resolution + command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv + - name: Executables in tmp + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_executables_in_tmp + - name: Last Ansible Play + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 432000 -c 604800 + - name: Memory Usage + command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5% + hostgroup: "ansible,!tag-prov-zfs" + - name: Ping Self over DNS + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname + - name: Reboot Required + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required + - name: Unit atd.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service + - name: Unit backup.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service + hostgroup: "ansible,!role-hypervisor" + - name: Unit backup.timer + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer + hostgroup: "ansible,!role-hypervisor" + - name: Unit cron.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service + - name: Unit dbus.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service + - name: Unit ssh.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service + - name: Unit systemd-resolved.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service + hostgroup: "ansible,!role-hypervisor" + - name: Users + command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5 + # Privileged checks + # Required because check_disk may attempt to get the free space 
of + # restricted mountpoints + - name: Disk Usage + command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I '^/tmp/.mount_' -I '^/dev/loop' + # Device type checks + # R720 + - name: CPU0 Temperature + command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000 + hostgroup: device-type-r720 + - name: CPU1 Temperature + command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001 + hostgroup: device-type-r720 + # Pi 4B (2G, 4G, 4G-storage) + - name: CPU Temperature + command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0 + hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage + # Device role checks + # hypervisor (which is assumed to be Proxmox) + - name: PVE Unit pve-firewall.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service + hostgroup: role-hypervisor + - name: PVE Unit spiceproxy.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service + hostgroup: role-hypervisor + - name: PVE Unit pve-ha-crm.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service + hostgroup: role-hypervisor + - name: PVE Unit pvedaemon.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service + hostgroup: role-hypervisor + - name: PVE Unit pvefw-logger.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service + hostgroup: role-hypervisor + - name: PVE Unit pveproxy.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service + hostgroup: role-hypervisor + 
- name: PVE Unit pve-cluster.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service + hostgroup: role-hypervisor + - name: PVE Unit pvestatd.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service + hostgroup: role-hypervisor + # Tag-specific checks + # docker + - name: Unit docker.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service + hostgroup: "ansible,!tag-no-docker" + - name: Docker Status + command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running + hostgroup: tag-nagios-checkdocker + # nagios-checkpgsql + - name: PSQL + command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5" + hostgroup: tag-nagios-checkpgsql + - name: PSQL Connections + command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0" + hostgroup: tag-nagios-checkpgsql + # https://rhaas.blogspot.com/2020/02/useless-vacuuming.html + - name: PSQL Old Xacts + command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000" + hostgroup: tag-nagios-checkpgsql + - name: Unit postgresql.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service + hostgroup: tag-nagios-checkpgsql + # nagios-checkswap + - name: Swap Usage + command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10% + hostgroup: tag-nagios-checkswap + # zerotier + - name: Unit 
zerotier-one.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service + hostgroup: tag-zt-personal # For Netbox secret_netbox_user_pass: !vault | diff --git a/playbooks/prod_web.yml b/playbooks/prod_web.yml index 654d67d..4d2ad31 100755 --- a/playbooks/prod_web.yml +++ b/playbooks/prod_web.yml @@ -55,154 +55,13 @@ tags: [ web, git ] - role: nagios vars: + # Nagios contacts, commands and services live in inventory vars + # Edit inventories/production/group_vars/all.yml to change them nagios_matrix_server: "https://matrix.desu.ltd" nagios_matrix_room: "!NWNCKlNmOTcarMcMIh:desu.ltd" nagios_matrix_token: "{{ secret_nagios_matrix_token }}" nagios_data_dir: /data/nagios nagios_admin_pass: "{{ secret_nagios_admin_pass }}" - nagios_contacts: - - name: matrix - host_notification_commands: notify-host-by-matrix - service_notification_commands: notify-service-by-matrix - host_notification_period: ansible-not-late-at-night - service_notification_period: ansible-not-late-at-night - extra: - - key: contactgroups - value: ansible - - name: salt - host_notification_commands: notify-host-by-email - service_notification_commands: notify-service-by-email - extra: - - key: email - value: alerts@babor.tech - nagios_commands: - # This command is included in the container image - - name: check_nrpe - command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$" - - name: check_by_ssh - command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\"" - - name: notify-host-by-matrix - command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\n$HOSTNAME$ is $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix" - - name: notify-service-by-matrix - command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\nService $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: 
$LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix" - nagios_services: - # Agentless checks - - name: HTTP - command: check_http - hostgroup: tag-nagios-checkhttp - - name: HTTPS - command: check_http!--ssl - hostgroup: tag-nagios-checkhttp - - name: SSH - command: check_ssh - # check_by_ssh checks - - name: CPU Utilization - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90 - - name: DNS Resolution - command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv - - name: Executables in tmp - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_executables_in_tmp - - name: Last Ansible Play - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 432000 -c 604800 - - name: Memory Usage - command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5% - hostgroup: "ansible,!tag-prov-zfs" - - name: Ping Self over DNS - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname - - name: Reboot Required - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required - - name: Unit atd.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service - - name: Unit backup.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service - hostgroup: "ansible,!role-hypervisor" - - name: Unit backup.timer - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer - hostgroup: "ansible,!role-hypervisor" - - name: Unit cron.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service - - name: Unit dbus.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service - - name: Unit ssh.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service - - name: Unit systemd-resolved.service - command: 
check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service - hostgroup: "ansible,!role-hypervisor" - - name: Users - command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5 - # Privileged checks - # Required because check_disk may attempt to get the free space of - # restricted mountpoints - - name: Disk Usage - command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I '^/tmp/.mount_' -I '^/dev/loop' - # Device type checks - # R720 - - name: CPU0 Temperature - command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000 - hostgroup: device-type-r720 - - name: CPU1 Temperature - command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001 - hostgroup: device-type-r720 - # Pi 4 4G - - name: CPU Temperature - command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0 - hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage - # Device role checks - # hypervisor (which is assumed to be Proxmox) - - name: PVE Unit pve-firewall.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service - hostgroup: role-hypervisor - - name: PVE Unit spiceproxy.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service - hostgroup: role-hypervisor - - name: PVE Unit pve-ha-crm.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service - hostgroup: role-hypervisor - - name: PVE Unit pvedaemon.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service - hostgroup: role-hypervisor - - name: PVE Unit 
pvefw-logger.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service - hostgroup: role-hypervisor - - name: PVE Unit pveproxy.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service - hostgroup: role-hypervisor - - name: PVE Unit pve-cluster.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service - hostgroup: role-hypervisor - - name: PVE Unit pvestatd.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service - hostgroup: role-hypervisor - # Tag-specific checks - # docker - - name: Unit docker.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service - hostgroup: "ansible,!tag-no-docker" - - name: Docker Status - command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running - hostgroup: tag-nagios-checkdocker - # nagios-checkpgsql - - name: PSQL - command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5" - hostgroup: tag-nagios-checkpgsql - - name: PSQL Connections - command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0" - hostgroup: tag-nagios-checkpgsql - # https://rhaas.blogspot.com/2020/02/useless-vacuuming.html - - name: PSQL Old Xacts - command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000" - hostgroup: tag-nagios-checkpgsql - - name: Unit postgresql.service 
- command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service - hostgroup: tag-nagios-checkpgsql - # nagios-checkswap - - name: Swap Usage - command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10% - hostgroup: tag-nagios-checkswap - # zerotier - - name: Unit zerotier-one.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service - hostgroup: tag-zt-personal tags: [ nagios, no-auto ] - role: ingress vars: