From bad192e93e80097c26e2d7b1df11354208d1f2ec Mon Sep 17 00:00:00 2001
From: Salt
Date: Tue, 7 Sep 2021 14:27:23 -0500
Subject: [PATCH] Refactor Nagios checks into check_by_ssh instead of NRPE

I was never particularly fond of having a random one-off daemon doing my
RCE. Sure, it offers some protection, but limiting my exposure to the
open internet is far more ideal. I have tremendously more trust in the
OpenSSH project than I do in Nagios. And for that reason, I'll be
deprecating NRPE and shredding config files once these plays clean up
---
NOTE(review): line breaks and indentation in this patch were reconstructed
from a whitespace-collapsed copy. Hunk line counts match the headers, but
the exact indentation of context lines is inferred -- confirm against the
target tree before running `git am`. The author e-mail address is absent
from this copy.
NOTE(review): the PostgreSQL check passes the monitoring password as a
command-line argument (`-p ...`) through check_by_ssh; presumably visible
in process listings on the checked host -- confirm this is acceptable.

 playbooks/tags_nagios.yml                 | 30 ++++++++++++++++++++
 playbooks/tasks/web/nagios-ansible.cfg.j2 |  4 +--
 playbooks/tasks/web/nagios.yml            | 34 ++++++++++-------------
 site.yml                                  |  1 +
 4 files changed, 47 insertions(+), 22 deletions(-)
 create mode 100755 playbooks/tags_nagios.yml

diff --git a/playbooks/tags_nagios.yml b/playbooks/tags_nagios.yml
new file mode 100755
index 0000000..4a1d01a
--- /dev/null
+++ b/playbooks/tags_nagios.yml
@@ -0,0 +1,30 @@
+#!/usr/bin/env ansible-playbook
+# vim:ft=ansible:
+---
+- hosts: tags_nagios
+  roles:
+    - role: git
+      vars:
+        git_repos:
+          - repo: https://git.desu.ltd/salt/monitoring-scripts
+            dest: /usr/local/bin/monitoring-scripts
+      tags: [ nagios ]
+  tasks:
+    - name: assure nagios plugin packages
+      apt: name=monitoring-plugins,nagios-plugins-contrib
+      tags: [ nagios ]
+    - name: assure nagios user
+      user: name=nagios-checker state=present system=yes
+      tags: [ nagios ]
+    - name: assure nagios user ssh key
+      authorized_key:
+        user: nagios-checker
+        state: present
+        key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKNavw28C0mKIQVRLQDW2aoovliU1XCGaenDhIMwumK/ Nagios monitoring"
+      tags: [ nagios ]
+- hosts: all
+  tasks:
+    - name: disable nagios user when not tagged
+      user: name=nagios-checker state=absent remove=yes
+      when: "'tags_nagios' not in group_names"
+      tags: [ nagios ]
diff --git a/playbooks/tasks/web/nagios-ansible.cfg.j2 b/playbooks/tasks/web/nagios-ansible.cfg.j2
index 8e76064..28750cc 100644
--- a/playbooks/tasks/web/nagios-ansible.cfg.j2
+++ b/playbooks/tasks/web/nagios-ansible.cfg.j2
@@ -53,7 +53,7 @@ define hostgroup {
 {% for command in nagios_commands %}
 define command {
     command_name {{ command.name }}
-    command_line {{ command.line }}
+    command_line {{ command.command }}
 {% if command.extra is defined %}
 {% for kvp in command.extra %}
     {{ kvp.key }} {{ kvp.value }}
@@ -71,7 +71,7 @@ define service {
     use ansible-generic-service
     service_description {{ service.name }}
     check_command {{ service.command }}
-    hostgroup_name {{ service.hostgroup }}
+    hostgroup_name {{ service.hostgroup | default('ansible', true) }}
 {% if service.extra is defined %}
 {% for kvp in service.extra %}
     {{ kvp.key }} {{ kvp.value }}
diff --git a/playbooks/tasks/web/nagios.yml b/playbooks/tasks/web/nagios.yml
index c7d0493..37d3acb 100644
--- a/playbooks/tasks/web/nagios.yml
+++ b/playbooks/tasks/web/nagios.yml
@@ -9,6 +9,8 @@
       # This command is included in the container image
       - name: check_nrpe
         command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
+      - name: check_by_ssh
+        command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
     nagios_services:
       # Agentless checks
       - name: HTTP
@@ -19,44 +21,36 @@
         hostgroup: nagios-checkhttp
       - name: SSH
         command: check_ssh
-        hostgroup: ansible
-      # Agented checks
+      # check_by_ssh checks
       - name: CPU Load
-        command: check_nrpe!check_load
-        hostgroup: nagios-nrpe
+        command: check_by_ssh!/usr/lib/nagios/plugins/check_load -r -w 0.8,0.8,0.8 -c 1.0,0.9,0.9
       - name: Disk Usage
-        command: check_nrpe!check_disk_all
-        hostgroup: nagios-nrpe
+        command: check_by_ssh!/usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/'
       - name: Reboot Required
-        command: check_nrpe!check_reboot_required
-        hostgroup: nagios-nrpe
+        command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
       - name: Unit backup.service
-        command: check_nrpe!check_systemd_backup_service
-        hostgroup: nagios-nrpe
+        command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
       - name: Unit backup.timer
-        command: check_nrpe!check_systemd_backup_timer
-        hostgroup: nagios-nrpe
+        command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
       - name: Unit docker.service
-        command: check_nrpe!check_systemd_docker_service
-        hostgroup: nagios-nrpe
+        command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
       - name: Users
-        command: check_nrpe!check_users
-        hostgroup: nagios-nrpe
+        command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
       # Tag-specific checks
       # ansible-pull
       - name: Unit ansible-pull.service
-        command: check_nrpe!check_systemd_ansiblepull_service
+        command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.service
         hostgroup: ansible-pull
       - name: Unit ansible-pull.timer
-        command: check_nrpe!check_systemd_ansiblepull_timer
+        command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.timer
         hostgroup: ansible-pull
       # nagios-checkpgsql
       - name: PostgreSQL
-        command: check_nrpe!check_pgsql
+        command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
         hostgroup: nagios-checkpgsql
       # nagios-nrpeswap
       - name: Swap Usage
-        command: check_nrpe!check_swap
+        command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
         hostgroup: nagios-nrpeswap
   register: config
   tags: [ nagios, template ]
diff --git a/site.yml b/site.yml
index dce6172..175d022 100755
--- a/site.yml
+++ b/site.yml
@@ -9,6 +9,7 @@
 # Tags for fundamental services
 - import_playbook: playbooks/tags_zerotier.yml
 - import_playbook: playbooks/tags_snmp.yml
+- import_playbook: playbooks/tags_nagios.yml
 - import_playbook: playbooks/tags_nagios-nrpe.yml
 # Device roles
 - import_playbook: playbooks/device_roles_pik8s-storage.yml