Refactor Nagios checks into check_by_ssh instead of NRPE

I was never particularly fond of having a random one-off daemon doing my RCE. Sure, it offers some protection, but limiting my exposure to the open internet is far preferable.

I have tremendously more trust in the OpenSSH project than I do in Nagios. For that reason, I'll be deprecating NRPE and shredding config files once these plays clean up.
This commit is contained in:
Salt 2021-09-07 14:27:23 -05:00
parent b38bb4bf62
commit bad192e93e
4 changed files with 47 additions and 22 deletions

30
playbooks/tags_nagios.yml Executable file
View File

@ -0,0 +1,30 @@
#!/usr/bin/env ansible-playbook
# vim:ft=ansible:
---
# Prepare every host in the tags_nagios group to be checked over SSH:
# install the check plugins, create the nagios-checker account, and
# authorize the monitoring key for that account.
- hosts: tags_nagios
  roles:
    # The git role is defined elsewhere; it is handed the monitoring-scripts
    # repository and the path it should end up at.
    - role: git
      vars:
        git_repos:
          - repo: https://git.desu.ltd/salt/monitoring-scripts
            dest: /usr/local/bin/monitoring-scripts
      tags: [nagios]
  tasks:
    - name: assure nagios plugin packages
      apt:
        name:
          - monitoring-plugins
          - nagios-plugins-contrib
        state: present
      tags: [nagios]
    - name: assure nagios user
      user:
        name: nagios-checker
        state: present
        system: true
      tags: [nagios]
    # Public key only; the matching private key is not part of this play.
    - name: assure nagios user ssh key
      authorized_key:
        user: nagios-checker
        state: present
        key: "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKNavw28C0mKIQVRLQDW2aoovliU1XCGaenDhIMwumK/ Nagios monitoring"
      tags: [nagios]
# Clean-up play: runs against every host, but the task only acts on hosts
# that are NOT members of the tags_nagios group.
- hosts: all
  tasks:
    # Despite the name, this deletes the account (state: absent); remove: true
    # additionally asks the user module to remove the account's directories.
    - name: disable nagios user when not tagged
      user:
        name: nagios-checker
        state: absent
        remove: true
      when: "'tags_nagios' not in group_names"
      tags: [nagios]

View File

@ -53,7 +53,7 @@ define hostgroup {
{% for command in nagios_commands %}
define command {
command_name {{ command.name }}
command_line {{ command.line }}
command_line {{ command.command }}
{% if command.extra is defined %}
{% for kvp in command.extra %}
{{ kvp.key }} {{ kvp.value }}
@ -71,7 +71,7 @@ define service {
use ansible-generic-service
service_description {{ service.name }}
check_command {{ service.command }}
hostgroup_name {{ service.hostgroup }}
hostgroup_name {{ service.hostgroup | default('ansible', true) }}
{% if service.extra is defined %}
{% for kvp in service.extra %}
{{ kvp.key }} {{ kvp.value }}

View File

@ -9,6 +9,8 @@
# This command is included in the container image
- name: check_nrpe
command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
- name: check_by_ssh
command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
nagios_services:
# Agentless checks
- name: HTTP
@ -19,44 +21,36 @@
hostgroup: nagios-checkhttp
- name: SSH
command: check_ssh
hostgroup: ansible
# Agented checks
# check_by_ssh checks
- name: CPU Load
command: check_nrpe!check_load
hostgroup: nagios-nrpe
command: check_by_ssh!/usr/lib/nagios/plugins/check_load -r -w 0.8,0.8,0.8 -c 1.0,0.9,0.9
- name: Disk Usage
command: check_nrpe!check_disk_all
hostgroup: nagios-nrpe
command: check_by_ssh!/usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/'
- name: Reboot Required
command: check_nrpe!check_reboot_required
hostgroup: nagios-nrpe
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
- name: Unit backup.service
command: check_nrpe!check_systemd_backup_service
hostgroup: nagios-nrpe
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
- name: Unit backup.timer
command: check_nrpe!check_systemd_backup_timer
hostgroup: nagios-nrpe
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
- name: Unit docker.service
command: check_nrpe!check_systemd_docker_service
hostgroup: nagios-nrpe
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
- name: Users
command: check_nrpe!check_users
hostgroup: nagios-nrpe
command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
# Tag-specific checks
# ansible-pull
- name: Unit ansible-pull.service
command: check_nrpe!check_systemd_ansiblepull_service
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.service
hostgroup: ansible-pull
- name: Unit ansible-pull.timer
command: check_nrpe!check_systemd_ansiblepull_timer
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.timer
hostgroup: ansible-pull
# nagios-checkpgsql
- name: PostgreSQL
command: check_nrpe!check_pgsql
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
hostgroup: nagios-checkpgsql
# nagios-nrpeswap
- name: Swap Usage
command: check_nrpe!check_swap
command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
hostgroup: nagios-nrpeswap
register: config
tags: [ nagios, template ]

View File

@ -9,6 +9,7 @@
# Tags for fundamental services
- import_playbook: playbooks/tags_zerotier.yml
- import_playbook: playbooks/tags_snmp.yml
- import_playbook: playbooks/tags_nagios.yml
- import_playbook: playbooks/tags_nagios-nrpe.yml
# Device roles
- import_playbook: playbooks/device_roles_pik8s-storage.yml