Move nagios defs to inventory

This commit is contained in:
Salt 2024-07-09 13:28:48 -05:00
parent 320427cba4
commit 9cad3d4867
2 changed files with 145 additions and 143 deletions

View File

@ -166,6 +166,149 @@ secret_nagios_matrix_token: !vault |
6433376138386531380a383762393137613738643538343438633730313135613730613139393536 6433376138386531380a383762393137613738643538343438633730313135613730613139393536
35666133666262383862663637623738643836383633653864626231623034613662646563623936 35666133666262383862663637623738643836383633653864626231623034613662646563623936
3763356331333561383833386162616664376335333139376363 3763356331333561383833386162616664376335333139376363
nagios_contacts:
- name: matrix
host_notification_commands: notify-host-by-matrix
service_notification_commands: notify-service-by-matrix
host_notification_period: ansible-not-late-at-night
service_notification_period: ansible-not-late-at-night
extra:
- key: contactgroups
value: ansible
- name: salt
host_notification_commands: notify-host-by-email
service_notification_commands: notify-service-by-email
extra:
- key: email
value: alerts@babor.tech
nagios_commands:
# This command is included in the container image
- name: check_nrpe
command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
- name: check_by_ssh
command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
- name: notify-host-by-matrix
command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\n$HOSTNAME$ is $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
- name: notify-service-by-matrix
command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\nService $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
nagios_services:
# Agentless checks
- name: HTTP
command: check_http
hostgroup: tag-nagios-checkhttp
- name: HTTPS
command: check_http!--ssl
hostgroup: tag-nagios-checkhttp
- name: SSH
command: check_ssh
# check_by_ssh checks
- name: CPU Utilization
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90
- name: DNS Resolution
command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv
- name: Executables in tmp
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_executables_in_tmp
- name: Last Ansible Play
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 432000 -c 604800
- name: Memory Usage
command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5%
hostgroup: "ansible,!tag-prov-zfs"
- name: Ping Self over DNS
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname
- name: Reboot Required
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
- name: Unit atd.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service
- name: Unit backup.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
hostgroup: "ansible,!role-hypervisor"
- name: Unit backup.timer
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
hostgroup: "ansible,!role-hypervisor"
- name: Unit cron.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service
- name: Unit dbus.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service
- name: Unit ssh.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service
- name: Unit systemd-resolved.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service
hostgroup: "ansible,!role-hypervisor"
- name: Users
command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
# Privileged checks
# Required because check_disk may attempt to get the free space of
# restricted mountpoints
- name: Disk Usage
command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I '^/tmp/.mount_' -I '^/dev/loop'
# Device type checks
# R720
- name: CPU0 Temperature
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000
hostgroup: device-type-r720
- name: CPU1 Temperature
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001
hostgroup: device-type-r720
# Pi 4 4G
- name: CPU Temperature
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0
hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage
# Device role checks
# hypervisor (which is assumed to be Proxmox)
- name: PVE Unit pve-firewall.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service
hostgroup: role-hypervisor
- name: PVE Unit spiceproxy.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service
hostgroup: role-hypervisor
- name: PVE Unit pve-ha-crm.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service
hostgroup: role-hypervisor
- name: PVE Unit pvedaemon.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service
hostgroup: role-hypervisor
- name: PVE Unit pvefw-logger.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service
hostgroup: role-hypervisor
- name: PVE Unit pveproxy.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service
hostgroup: role-hypervisor
- name: PVE Unit pve-cluster.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service
hostgroup: role-hypervisor
- name: PVE Unit pvestatd.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service
hostgroup: role-hypervisor
# Tag-specific checks
# docker
- name: Unit docker.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
hostgroup: "ansible,!tag-no-docker"
- name: Docker Status
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running
hostgroup: tag-nagios-checkdocker
# nagios-checkpgsql
- name: PSQL
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
hostgroup: tag-nagios-checkpgsql
- name: PSQL Connections
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0"
hostgroup: tag-nagios-checkpgsql
# https://rhaas.blogspot.com/2020/02/useless-vacuuming.html
- name: PSQL Old Xacts
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000"
hostgroup: tag-nagios-checkpgsql
- name: Unit postgresql.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service
hostgroup: tag-nagios-checkpgsql
# nagios-checkswap
- name: Swap Usage
command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
hostgroup: tag-nagios-checkswap
# zerotier
- name: Unit zerotier-one.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service
hostgroup: tag-zt-personal
# For Netbox # For Netbox
secret_netbox_user_pass: !vault | secret_netbox_user_pass: !vault |

View File

@ -55,154 +55,13 @@
tags: [ web, git ] tags: [ web, git ]
- role: nagios - role: nagios
vars: vars:
# Definitions for contacts and checks are defined in inventory vars
# See group_vars/all.yml if you need to change those
nagios_matrix_server: "https://matrix.desu.ltd" nagios_matrix_server: "https://matrix.desu.ltd"
nagios_matrix_room: "!NWNCKlNmOTcarMcMIh:desu.ltd" nagios_matrix_room: "!NWNCKlNmOTcarMcMIh:desu.ltd"
nagios_matrix_token: "{{ secret_nagios_matrix_token }}" nagios_matrix_token: "{{ secret_nagios_matrix_token }}"
nagios_data_dir: /data/nagios nagios_data_dir: /data/nagios
nagios_admin_pass: "{{ secret_nagios_admin_pass }}" nagios_admin_pass: "{{ secret_nagios_admin_pass }}"
nagios_contacts:
- name: matrix
host_notification_commands: notify-host-by-matrix
service_notification_commands: notify-service-by-matrix
host_notification_period: ansible-not-late-at-night
service_notification_period: ansible-not-late-at-night
extra:
- key: contactgroups
value: ansible
- name: salt
host_notification_commands: notify-host-by-email
service_notification_commands: notify-service-by-email
extra:
- key: email
value: alerts@babor.tech
nagios_commands:
# This command is included in the container image
- name: check_nrpe
command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
- name: check_by_ssh
command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
- name: notify-host-by-matrix
command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\n$HOSTNAME$ is $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
- name: notify-service-by-matrix
command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\nService $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
nagios_services:
# Agentless checks
- name: HTTP
command: check_http
hostgroup: tag-nagios-checkhttp
- name: HTTPS
command: check_http!--ssl
hostgroup: tag-nagios-checkhttp
- name: SSH
command: check_ssh
# check_by_ssh checks
- name: CPU Utilization
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90
- name: DNS Resolution
command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv
- name: Executables in tmp
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_executables_in_tmp
- name: Last Ansible Play
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 432000 -c 604800
- name: Memory Usage
command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5%
hostgroup: "ansible,!tag-prov-zfs"
- name: Ping Self over DNS
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname
- name: Reboot Required
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
- name: Unit atd.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service
- name: Unit backup.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
hostgroup: "ansible,!role-hypervisor"
- name: Unit backup.timer
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
hostgroup: "ansible,!role-hypervisor"
- name: Unit cron.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service
- name: Unit dbus.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service
- name: Unit ssh.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service
- name: Unit systemd-resolved.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service
hostgroup: "ansible,!role-hypervisor"
- name: Users
command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
# Privileged checks
# Required because check_disk may attempt to get the free space of
# restricted mountpoints
- name: Disk Usage
command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I '^/tmp/.mount_' -I '^/dev/loop'
# Device type checks
# R720
- name: CPU0 Temperature
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000
hostgroup: device-type-r720
- name: CPU1 Temperature
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001
hostgroup: device-type-r720
# Pi 4 4G
- name: CPU Temperature
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0
hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage
# Device role checks
# hypervisor (which is assumed to be Proxmox)
- name: PVE Unit pve-firewall.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service
hostgroup: role-hypervisor
- name: PVE Unit spiceproxy.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service
hostgroup: role-hypervisor
- name: PVE Unit pve-ha-crm.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service
hostgroup: role-hypervisor
- name: PVE Unit pvedaemon.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service
hostgroup: role-hypervisor
- name: PVE Unit pvefw-logger.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service
hostgroup: role-hypervisor
- name: PVE Unit pveproxy.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service
hostgroup: role-hypervisor
- name: PVE Unit pve-cluster.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service
hostgroup: role-hypervisor
- name: PVE Unit pvestatd.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service
hostgroup: role-hypervisor
# Tag-specific checks
# docker
- name: Unit docker.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
hostgroup: "ansible,!tag-no-docker"
- name: Docker Status
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running
hostgroup: tag-nagios-checkdocker
# nagios-checkpgsql
- name: PSQL
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
hostgroup: tag-nagios-checkpgsql
- name: PSQL Connections
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0"
hostgroup: tag-nagios-checkpgsql
# https://rhaas.blogspot.com/2020/02/useless-vacuuming.html
- name: PSQL Old Xacts
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000"
hostgroup: tag-nagios-checkpgsql
- name: Unit postgresql.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service
hostgroup: tag-nagios-checkpgsql
# nagios-checkswap
- name: Swap Usage
command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
hostgroup: tag-nagios-checkswap
# zerotier
- name: Unit zerotier-one.service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service
hostgroup: tag-zt-personal
tags: [ nagios, no-auto ] tags: [ nagios, no-auto ]
- role: ingress - role: ingress
vars: vars: