diff --git a/inventories/production/group_vars/all.yml b/inventories/production/group_vars/all.yml index 9b2f7dd..6807882 100644 --- a/inventories/production/group_vars/all.yml +++ b/inventories/production/group_vars/all.yml @@ -202,119 +202,20 @@ nagios_commands: - name: notify-service-by-matrix command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\nService $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix" nagios_services: - # Agentless checks - - name: HTTP - command: check_http - hostgroup: tag-nagios-checkhttp - - name: HTTPS - command: check_http!--ssl - hostgroup: tag-nagios-checkhttp - name: SSH command: check_ssh # check_by_ssh checks - - name: CPU Utilization - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90 - - name: DNS Resolution - command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv - - name: Executables in tmp - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_executables_in_tmp - name: Last Ansible Play command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 432000 -c 604800 - - name: Memory Usage - command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5% - hostgroup: "ansible,!tag-prov-zfs" - - name: Ping Self over DNS - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname - name: Reboot Required command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required - - name: Unit atd.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service - name: Unit backup.service command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service hostgroup: "ansible,!role-hypervisor" - name: Unit backup.timer command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer hostgroup: "ansible,!role-hypervisor" - - name: Unit cron.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service - - name: Unit dbus.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service - - name: Unit ssh.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service - - name: Unit systemd-resolved.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service - hostgroup: "ansible,!role-hypervisor" - - name: Users - command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5 - # Privileged checks - # Required because check_disk may attempt to get the free space of - # restricted mountpoints - - name: Disk Usage - command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I '^/tmp/.mount_' -I '^/dev/loop' - # Device type checks - # R720 - - name: CPU0 Temperature - command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000 - hostgroup: device-type-r720 - - name: CPU1 Temperature - command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001 - hostgroup: device-type-r720 - # Pi 4 4G - - name: CPU Temperature - command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0 - hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage - # Device role checks - # hypervisor (which is assumed to be Proxmox) - - name: PVE Unit pve-firewall.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service - hostgroup: role-hypervisor - - name: PVE Unit spiceproxy.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service - hostgroup: role-hypervisor - - name: PVE Unit pve-ha-crm.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service - hostgroup: role-hypervisor - - name: PVE Unit pvedaemon.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service - hostgroup: role-hypervisor - - name: PVE Unit pvefw-logger.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service - hostgroup: role-hypervisor - - name: PVE Unit pveproxy.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service - hostgroup: role-hypervisor - - name: PVE Unit pve-cluster.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service - hostgroup: role-hypervisor - - name: PVE Unit pvestatd.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service - hostgroup: role-hypervisor # Tag-specific checks - # docker - - name: Unit docker.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service - hostgroup: "ansible,!tag-no-docker" - - name: Docker Status - command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running - hostgroup: tag-nagios-checkdocker - # nagios-checkpgsql - - name: PSQL - command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5" - hostgroup: tag-nagios-checkpgsql - - name: PSQL Connections - command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0" - hostgroup: tag-nagios-checkpgsql - # https://rhaas.blogspot.com/2020/02/useless-vacuuming.html - - name: PSQL Old Xacts - command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000" - hostgroup: tag-nagios-checkpgsql - - name: Unit postgresql.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service - hostgroup: tag-nagios-checkpgsql - # nagios-checkswap - - name: Swap Usage - command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10% - hostgroup: tag-nagios-checkswap # zerotier - name: Unit zerotier-one.service command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service diff --git a/roles/nagios/templates/nagios-ansible-inventory.cfg.j2 b/roles/nagios/templates/nagios-ansible-inventory.cfg.j2 index 52bc7bd..62e7317 100644 --- a/roles/nagios/templates/nagios-ansible-inventory.cfg.j2 +++ b/roles/nagios/templates/nagios-ansible-inventory.cfg.j2 @@ -149,70 +149,5 @@ define host { contact_groups ansible } -{% for service in vars.services %} -{% for tag in service.tags %} -{# #} -{% if tag.slug == "nagios-checkmatrix" %} -{% for port in service.ports %} -define service { - use ansible-generic-service - service_description Matrix Synapse - {{ service.name }} - {{ port }} - check_command check_http!--ssl -H {{ service.name }} -u https://{{ service.name }}/health -s OK -p {{ port }} -f sticky - host_name {{ host }} - contact_groups ansible -} -{% endfor %} -{% endif %} -{# #} -{% if tag.slug == "nagios-checkminecraft" %} -{% for port in service.ports %} -define service { - use ansible-generic-service - service_description Minecraft - {{ service.name }} - {{ port }} - check_command check_by_ssh!/usr/local/bin/monitoring-scripts/check_minecraft -H {{ host }} -p {{ port }} -m "{{ service.description }}" -f -w 3 -c 5 - host_name {{ host }} - contact_groups ansible -} -{% endfor %} -{% endif %} -{# #} -{% if tag.slug == "nagios-checkhttp" %} -{% for port in service.ports %} -define service { - use ansible-generic-service - service_description HTTP - {{ service.name }} - {{ port }} - check_command check_http!-H {{ service.name }} -p {{ port }} -f sticky - host_name {{ host }} - contact_groups ansible -} -{% endfor %} -{% endif %} -{# #} -{% if tag.slug == "nagios-checkhttps" %} -{% for port in service.ports %} -define service { - use ansible-generic-service - service_description HTTPS - {{ service.name }} - {{ port }} - check_command check_http!--ssl -H {{ service.name }} -p {{ port }} -f sticky - host_name {{ host }} - contact_groups ansible -} -{% endfor %} -{% endif %} -{# #} -{% if tag.slug == "nagios-checktcp" %} -{% for port in service.ports %} -define service { - use ansible-generic-service - service_description TCP {{ service.name }} - {{ port }} - check_command check_tcp!{{ port }} - host_name {{ host }} - contact_groups ansible -} -{% endfor %} -{% endif %} -{# #} -{% endfor %} -{% endfor %} {% endif %} {% endfor %} diff --git a/roles/nagios/templates/nagios-ansible.cfg.j2 b/roles/nagios/templates/nagios-ansible.cfg.j2 index 39275ee..f485294 100644 --- a/roles/nagios/templates/nagios-ansible.cfg.j2 +++ b/roles/nagios/templates/nagios-ansible.cfg.j2 @@ -182,55 +182,6 @@ define host { # Created: {{ service.value.created }} # Updated: {{ service.value.last_updated }} {% for tag in service.value.tags %} -{# #} -{% if tag.slug == "nagios-checkminecraft" %} -{% for port in service.value.ports %} -define service { - use ansible-generic-service - service_description Minecraft - {{ service.value.name }} - {{ port }} - check_command check_by_ssh!/usr/local/bin/monitoring-scripts/check_minecraft -H {{ host_name }} -p {{ port }} -m "{{ service.value.description }}" -f -w 3 -c 5 - host_name {{ host_name }} - contact_groups ansible -} -{% endfor %} -{% endif %} -{# #} -{% if tag.slug == "nagios-checkhttp" %} -{% for port in service.value.ports %} -define service { - use ansible-generic-service - service_description HTTP - {{ service.value.name }} - {{ port }} - check_command check_http!-H {{ service.value.name }} -p {{ port }} -f sticky - host_name {{ host_name }} - contact_groups ansible -} -{% endfor %} -{% endif %} -{# #} -{% if tag.slug == "nagios-checkhttps" %} -{% for port in service.value.ports %} -define service { - use ansible-generic-service - service_description HTTPS - {{ service.value.name }} - {{ port }} - check_command check_http!--ssl -H {{ service.value.name }} -p {{ port }} -f sticky - host_name {{ host_name }} - contact_groups ansible -} -{% endfor %} -{% endif %} -{# #} -{% if tag.slug == "nagios-checktcp" %} -{% for port in service.value.ports %} -define service { - use ansible-generic-service - service_description TCP {{ service.value.name }} - {{ port }} - check_command check_tcp!{{ port }} - host_name {{ host_name }} - contact_groups ansible -} -{% endfor %} -{% endif %} -{# #} {% endfor %} {% endif %} {% endfor %}