From fcffd834a0e2c5fc61b2252a7fe662f2f4ea1a1e Mon Sep 17 00:00:00 2001 From: Salt Date: Wed, 8 Dec 2021 21:34:32 -0600 Subject: [PATCH] Move Nagios into its own role It was getting way too big --- playbooks/prod_web.yml | 130 +++++++++++++- playbooks/tasks/web/nagios.yml | 158 ------------------ roles/nagios/defaults/main.yml | 27 +++ roles/nagios/handlers/main.yml | 4 + roles/nagios/tasks/main.yml | 35 ++++ .../nagios/templates}/nagios-ansible.cfg.j2 | 0 6 files changed, 194 insertions(+), 160 deletions(-) delete mode 100644 playbooks/tasks/web/nagios.yml create mode 100644 roles/nagios/defaults/main.yml create mode 100644 roles/nagios/handlers/main.yml create mode 100644 roles/nagios/tasks/main.yml rename {playbooks/tasks/web => roles/nagios/templates}/nagios-ansible.cfg.j2 (100%) diff --git a/playbooks/prod_web.yml b/playbooks/prod_web.yml index 65245e2..4ac2a57 100755 --- a/playbooks/prod_web.yml +++ b/playbooks/prod_web.yml @@ -130,8 +130,6 @@ - app/redis.yml - web/movienight.yml - web/netbox.yml - # TODO: Replace this with Naemon(?) - - web/nagios.yml tags: [ always ] roles: - role: backup @@ -139,6 +137,134 @@ backup_s3backup_list_extra: - /data tags: [ backup ] + # TODO: Replace this with Naemon(?) 
+ - role: nagios + vars: + nagios_data_dir: /data/nagios + nagios_admin_pass: "{{ secret_nagios_admin_pass }}" + nagios_contacts: + - name: salt + host_notification_commands: notify-host-by-email + service_notification_commands: notify-service-by-email + extra: + - key: email + value: rehashedsalt@cock.li + nagios_commands: + # This command is included in the container image + - name: check_nrpe + command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$" + - name: check_by_ssh + command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\"" + nagios_services: + # Agentless checks + - name: HTTP + command: check_http + hostgroup: tag-nagios-checkhttp + - name: HTTPS + command: check_http!--ssl + hostgroup: tag-nagios-checkhttp + - name: SSH + command: check_ssh + # check_by_ssh checks + - name: CPU Load + command: check_by_ssh!/usr/lib/nagios/plugins/check_load -r -w 5,4,3 -c 7,6,5 + - name: CPU Utilization + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90 + - name: Disk Usage + command: check_by_ssh!/usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' + - name: DNS Resolution + command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv + - name: Last Ansible Play + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 93600 -c 129600 + - name: Memory Usage + command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 20% -c 10% + - name: Package Updates + command: check_by_ssh!/usr/lib/nagios/plugins/check_packages + extra: + - key: notification_options + value: c,r + - name: Ping Self over DNS + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname + - name: Reboot Required + command: 
check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required + - name: Unit atd.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service + - name: Unit backup.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service + hostgroup: "!role-hypervisor" + - name: Unit backup.timer + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer + hostgroup: "!role-hypervisor" + - name: Unit cron.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service + - name: Unit dbus.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service + - name: Unit docker.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service + hostgroup: "!tag-no-docker" + - name: Unit ssh.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service + - name: Unit systemd-resolved.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service + hostgroup: "!role-hypervisor" + - name: Users + command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5 + # Device role checks + # hypervisor (which is assumed to be Proxmox) + - name: PVE Unit pve-firewall.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service + hostgroup: role-hypervisor + - name: PVE Unit spiceproxy.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service + hostgroup: role-hypervisor + - name: PVE Unit pve-ha-crm.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service + hostgroup: role-hypervisor + - name: PVE Unit pvedaemon.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service + hostgroup: role-hypervisor + - name: PVE Unit 
pvefw-logger.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service + hostgroup: role-hypervisor + - name: PVE Unit pveproxy.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service + hostgroup: role-hypervisor + - name: PVE Unit pve-cluster.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service + hostgroup: role-hypervisor + - name: PVE Unit pvestatd.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service + hostgroup: role-hypervisor + # Tag-specific checks + # ansible-pull + - name: Unit ansible-pull.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.service + hostgroup: tag-ansible-pull + - name: Unit ansible-pull.timer + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.timer + hostgroup: tag-ansible-pull + # docker + # Strictly speaking not a tag, but it's best to keep it separated + # TODO: Figure out how I'm going to implement Docker checks + # nagios-checkpgsql + - name: PSQL + command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5" + hostgroup: tag-nagios-checkpgsql + - name: PSQL Connections + command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0" + hostgroup: tag-nagios-checkpgsql + - name: Unit postgresql.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service + hostgroup: tag-nagios-checkpgsql + # nagios-checkswap + - name: Swap Usage + command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 
20% -c 10% + hostgroup: tag-nagios-checkswap + # zerotier + - name: Unit zerotier-one.service + command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service + hostgroup: tag-zerotier + tags: [ nagios ] - role: ingress vars: ingress_servers: diff --git a/playbooks/tasks/web/nagios.yml b/playbooks/tasks/web/nagios.yml deleted file mode 100644 index 5893861..0000000 --- a/playbooks/tasks/web/nagios.yml +++ /dev/null @@ -1,158 +0,0 @@ -# vim:ft=ansible: -- name: assure data directory for nagios - file: path=/data/nagios state=directory mode=0755 - tags: [ nagios ] -- name: template out config for nagios - template: src=nagios-ansible.cfg.j2 dest=/data/nagios/etc/objects/ansible.cfg owner=root group=root mode=0644 - vars: - nagios_contacts: - - name: salt - host_notification_commands: notify-host-by-email - service_notification_commands: notify-service-by-email - extra: - - key: email - value: rehashedsalt@cock.li - nagios_commands: - # This command is included in the container image - - name: check_nrpe - command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$" - - name: check_by_ssh - command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\"" - nagios_services: - # Agentless checks - - name: HTTP - command: check_http - hostgroup: tag-nagios-checkhttp - - name: HTTPS - command: check_http!--ssl - hostgroup: tag-nagios-checkhttp - - name: SSH - command: check_ssh - # check_by_ssh checks - - name: CPU Load - command: check_by_ssh!/usr/lib/nagios/plugins/check_load -r -w 5,4,3 -c 7,6,5 - - name: CPU Utilization - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90 - - name: Disk Usage - command: check_by_ssh!/usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' - - name: DNS Resolution - 
command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv - - name: Last Ansible Play - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 93600 -c 129600 - - name: Memory Usage - command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 20% -c 10% - - name: Package Updates - command: check_by_ssh!/usr/lib/nagios/plugins/check_packages - extra: - - key: notification_options - value: c,r - - name: Ping Self over DNS - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname - - name: Reboot Required - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required - - name: Unit atd.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service - - name: Unit backup.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service - hostgroup: "!role-hypervisor" - - name: Unit backup.timer - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer - hostgroup: "!role-hypervisor" - - name: Unit cron.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service - - name: Unit dbus.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service - - name: Unit docker.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service - hostgroup: "!tag-no-docker" - - name: Unit ssh.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service - - name: Unit systemd-resolved.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service - hostgroup: "!role-hypervisor" - - name: Users - command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5 - # Device role checks - # hypervisor (which is assumed to be Proxmox) - - name: PVE Unit pve-firewall.service - command: 
check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service - hostgroup: role-hypervisor - - name: PVE Unit spiceproxy.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service - hostgroup: role-hypervisor - - name: PVE Unit pve-ha-crm.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service - hostgroup: role-hypervisor - - name: PVE Unit pvedaemon.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service - hostgroup: role-hypervisor - - name: PVE Unit pvefw-logger.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service - hostgroup: role-hypervisor - - name: PVE Unit pveproxy.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service - hostgroup: role-hypervisor - - name: PVE Unit pve-cluster.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service - hostgroup: role-hypervisor - - name: PVE Unit pvestatd.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service - hostgroup: role-hypervisor - # Tag-specific checks - # ansible-pull - - name: Unit ansible-pull.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.service - hostgroup: tag-ansible-pull - - name: Unit ansible-pull.timer - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.timer - hostgroup: tag-ansible-pull - # docker - # Strictly speaking not a tag, but it's best to keep it separated - # TODO: Figure out how I'm going to implement Docker checks - # nagios-checkpgsql - - name: PSQL - command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5" - hostgroup: tag-nagios-checkpgsql - - name: PSQL Connections 
- command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0" - hostgroup: tag-nagios-checkpgsql - - name: Unit postgresql.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service - hostgroup: tag-nagios-checkpgsql - # nagios-checkswap - - name: Swap Usage - command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10% - hostgroup: tag-nagios-checkswap - # zerotier - - name: Unit zerotier-one.service - command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service - hostgroup: tag-zerotier - register: config - tags: [ nagios, template ] -- name: assure config file is loaded - lineinfile: path=/data/nagios/etc/nagios.cfg line='cfg_file=/opt/nagios/etc/objects/ansible.cfg' - tags: [ nagios, template ] -- name: docker deploy nagios - docker_container: - name: nagios - image: jasonrivers/nagios - env: - NAGIOSADMIN_USER: admin - NAGIOSADMIN_PASS: "{{ secret_nagios_admin_pass }}" - NAGIOS_TIMEZONE: "America/Chicago" - networks: - - name: web - aliases: [ "nagios" ] - volumes: - - /data/nagios/etc:/opt/nagios/etc - - /data/nagios/var:/opt/nagios/var - - /data/nagios/plugins:/opt/Custom-Nagios-Plugins - - /data/nagios/nagiosgraph/var:/opt/nagiosgraph/var - - /data/nagios/nagiosgraph/etc:/opt/nagiosgraph/etc - - /dev/null:/opt/nagios/bin/nsca - - /dev/null:/opt/nagios/bin/send_nsca - tags: [ docker, nagios ] -- name: restart nagios - docker_container: name=nagios state=started restart=yes - when: config and config is changed - tags: [ docker, nagios ] diff --git a/roles/nagios/defaults/main.yml b/roles/nagios/defaults/main.yml new file mode 100644 index 0000000..9ff24b2 --- /dev/null +++ b/roles/nagios/defaults/main.yml @@ -0,0 +1,27 
@@ +#!/usr/bin/env ansible-playbook +# vim:ft=ansible: +nagios_data_dir: /data/nagios +nagios_admin_pass: foobar +nagios_timezone: "America/Chicago" +# nagios_contacts: +# - name: Bob +# host_notification_commands: notify-host-by-email +# service_notification_commands: notify-service-by-email +# extra: +# - key: email +# value: bob@mysite.example.com +nagios_contacts: [] +# nagios_commands: +# - name: check_thing +# command: "$USER1$/check_thing -H $HOSTADDRESS$ $ARG1$" +nagios_commands: [] +# nagios_services: +# - name: HTTP +# command: check_http +# hostgroup: tag-nagios-checkhttp +# - name: SSH +# command: check_ssh +# - name: Docker +# command: foo +# hostgroup: "!tag-no-docker" +nagios_services: [] diff --git a/roles/nagios/handlers/main.yml b/roles/nagios/handlers/main.yml new file mode 100644 index 0000000..a0d151d --- /dev/null +++ b/roles/nagios/handlers/main.yml @@ -0,0 +1,4 @@ +#!/usr/bin/env ansible-playbook +# vim:ft=ansible: +- name: restart nagios + docker_container: name=nagios state=started restart=yes diff --git a/roles/nagios/tasks/main.yml b/roles/nagios/tasks/main.yml new file mode 100644 index 0000000..8e330a0 --- /dev/null +++ b/roles/nagios/tasks/main.yml @@ -0,0 +1,35 @@ +# vim:ft=ansible: +- name: assure data directory for nagios + file: path="{{ nagios_data_dir }}" state=directory mode=0755 + tags: [ nagios ] +- name: docker deploy nagios + docker_container: + name: nagios + image: jasonrivers/nagios + pull: yes + restart_policy: unless-stopped + state: started + env: + NAGIOSADMIN_USER: admin + NAGIOSADMIN_PASS: "{{ nagios_admin_pass }}" + NAGIOS_TIMEZONE: "{{ nagios_timezone }}" + networks: + - name: web + aliases: [ "nagios" ] + volumes: + - "{{ nagios_data_dir }}/etc:/opt/nagios/etc" + - "{{ nagios_data_dir }}/var:/opt/nagios/var" + - "{{ nagios_data_dir }}/plugins:/opt/Custom-Nagios-Plugins" + - "{{ nagios_data_dir }}/nagiosgraph/var:/opt/nagiosgraph/var" + - "{{ nagios_data_dir }}/nagiosgraph/etc:/opt/nagiosgraph/etc" + - 
/dev/null:/opt/nagios/bin/nsca + - /dev/null:/opt/nagios/bin/send_nsca + tags: [ docker, nagios ] +- name: template out config for nagios + template: src=nagios-ansible.cfg.j2 dest="{{ nagios_data_dir }}/etc/objects/ansible.cfg" owner=root group=root mode=0644 + tags: [ nagios, template ] + notify: restart nagios +- name: assure config file is loaded + lineinfile: path="{{ nagios_data_dir }}/etc/nagios.cfg" line='cfg_file=/opt/nagios/etc/objects/ansible.cfg' + tags: [ nagios, template ] + notify: restart nagios diff --git a/playbooks/tasks/web/nagios-ansible.cfg.j2 b/roles/nagios/templates/nagios-ansible.cfg.j2 similarity index 100% rename from playbooks/tasks/web/nagios-ansible.cfg.j2 rename to roles/nagios/templates/nagios-ansible.cfg.j2