Move Nagios into its own role
It was getting way too big
This commit is contained in:
parent
a71071b321
commit
fcffd834a0
@ -130,8 +130,6 @@
|
||||
- app/redis.yml
|
||||
- web/movienight.yml
|
||||
- web/netbox.yml
|
||||
# TODO: Replace this with Naemon(?)
|
||||
- web/nagios.yml
|
||||
tags: [ always ]
|
||||
roles:
|
||||
- role: backup
|
||||
@ -139,6 +137,134 @@
|
||||
backup_s3backup_list_extra:
|
||||
- /data
|
||||
tags: [ backup ]
|
||||
# TODO: Replace this with Naemon(?)
|
||||
- role: nagios
|
||||
vars:
|
||||
nagios_data_dir: /data/nagios
|
||||
nagios_admin_pass: "{{ secret_nagios_admin_pass }}"
|
||||
nagios_contacts:
|
||||
- name: salt
|
||||
host_notification_commands: notify-host-by-email
|
||||
service_notification_commands: notify-service-by-email
|
||||
extra:
|
||||
- key: email
|
||||
value: rehashedsalt@cock.li
|
||||
nagios_commands:
|
||||
# This command is included in the container image
|
||||
- name: check_nrpe
|
||||
command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
|
||||
- name: check_by_ssh
|
||||
command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
|
||||
nagios_services:
|
||||
# Agentless checks
|
||||
- name: HTTP
|
||||
command: check_http
|
||||
hostgroup: tag-nagios-checkhttp
|
||||
- name: HTTPS
|
||||
command: check_http!--ssl
|
||||
hostgroup: tag-nagios-checkhttp
|
||||
- name: SSH
|
||||
command: check_ssh
|
||||
# check_by_ssh checks
|
||||
- name: CPU Load
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_load -r -w 5,4,3 -c 7,6,5
|
||||
- name: CPU Utilization
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90
|
||||
- name: Disk Usage
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/'
|
||||
- name: DNS Resolution
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv
|
||||
- name: Last Ansible Play
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 93600 -c 129600
|
||||
- name: Memory Usage
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 20% -c 10%
|
||||
- name: Package Updates
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_packages
|
||||
extra:
|
||||
- key: notification_options
|
||||
value: c,r
|
||||
- name: Ping Self over DNS
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname
|
||||
- name: Reboot Required
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
|
||||
- name: Unit atd.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service
|
||||
- name: Unit backup.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
|
||||
hostgroup: "!role-hypervisor"
|
||||
- name: Unit backup.timer
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
|
||||
hostgroup: "!role-hypervisor"
|
||||
- name: Unit cron.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service
|
||||
- name: Unit dbus.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service
|
||||
- name: Unit docker.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
|
||||
hostgroup: "!tag-no-docker"
|
||||
- name: Unit ssh.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service
|
||||
- name: Unit systemd-resolved.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service
|
||||
hostgroup: "!role-hypervisor"
|
||||
- name: Users
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
|
||||
# Device role checks
|
||||
# hypervisor (which is assumed to be Proxmox)
|
||||
- name: PVE Unit pve-firewall.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit spiceproxy.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pve-ha-crm.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pvedaemon.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pvefw-logger.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pveproxy.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pve-cluster.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pvestatd.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service
|
||||
hostgroup: role-hypervisor
|
||||
# Tag-specific checks
|
||||
# ansible-pull
|
||||
- name: Unit ansible-pull.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.service
|
||||
hostgroup: tag-ansible-pull
|
||||
- name: Unit ansible-pull.timer
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.timer
|
||||
hostgroup: tag-ansible-pull
|
||||
# docker
|
||||
# Strictly speaking not a tag, but it's best to keep it separated
|
||||
# TODO: Figure out how I'm going to implement Docker checks
|
||||
# nagios-checkpgsql
|
||||
- name: PSQL
|
||||
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
|
||||
hostgroup: tag-nagios-checkpgsql
|
||||
- name: PSQL Connections
|
||||
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0"
|
||||
hostgroup: tag-nagios-checkpgsql
|
||||
- name: Unit postgresql.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service
|
||||
hostgroup: tag-nagios-checkpgsql
|
||||
# nagios-checkswap
|
||||
- name: Swap Usage
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||
hostgroup: tag-nagios-checkswap
|
||||
# zerotier
|
||||
- name: Unit zerotier-one.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service
|
||||
hostgroup: tag-zerotier
|
||||
tags: [ nagios ]
|
||||
- role: ingress
|
||||
vars:
|
||||
ingress_servers:
|
||||
|
@ -1,158 +0,0 @@
|
||||
# vim:ft=ansible:
|
||||
- name: assure data directory for nagios
|
||||
file: path=/data/nagios state=directory mode=0755
|
||||
tags: [ nagios ]
|
||||
- name: template out config for nagios
|
||||
template: src=nagios-ansible.cfg.j2 dest=/data/nagios/etc/objects/ansible.cfg owner=root group=root mode=0644
|
||||
vars:
|
||||
nagios_contacts:
|
||||
- name: salt
|
||||
host_notification_commands: notify-host-by-email
|
||||
service_notification_commands: notify-service-by-email
|
||||
extra:
|
||||
- key: email
|
||||
value: rehashedsalt@cock.li
|
||||
nagios_commands:
|
||||
# This command is included in the container image
|
||||
- name: check_nrpe
|
||||
command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
|
||||
- name: check_by_ssh
|
||||
command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
|
||||
nagios_services:
|
||||
# Agentless checks
|
||||
- name: HTTP
|
||||
command: check_http
|
||||
hostgroup: tag-nagios-checkhttp
|
||||
- name: HTTPS
|
||||
command: check_http!--ssl
|
||||
hostgroup: tag-nagios-checkhttp
|
||||
- name: SSH
|
||||
command: check_ssh
|
||||
# check_by_ssh checks
|
||||
- name: CPU Load
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_load -r -w 5,4,3 -c 7,6,5
|
||||
- name: CPU Utilization
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90
|
||||
- name: Disk Usage
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/'
|
||||
- name: DNS Resolution
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv
|
||||
- name: Last Ansible Play
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 93600 -c 129600
|
||||
- name: Memory Usage
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 20% -c 10%
|
||||
- name: Package Updates
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_packages
|
||||
extra:
|
||||
- key: notification_options
|
||||
value: c,r
|
||||
- name: Ping Self over DNS
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname
|
||||
- name: Reboot Required
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
|
||||
- name: Unit atd.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service
|
||||
- name: Unit backup.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
|
||||
hostgroup: "!role-hypervisor"
|
||||
- name: Unit backup.timer
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
|
||||
hostgroup: "!role-hypervisor"
|
||||
- name: Unit cron.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service
|
||||
- name: Unit dbus.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service
|
||||
- name: Unit docker.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
|
||||
hostgroup: "!tag-no-docker"
|
||||
- name: Unit ssh.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service
|
||||
- name: Unit systemd-resolved.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service
|
||||
hostgroup: "!role-hypervisor"
|
||||
- name: Users
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
|
||||
# Device role checks
|
||||
# hypervisor (which is assumed to be Proxmox)
|
||||
- name: PVE Unit pve-firewall.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit spiceproxy.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pve-ha-crm.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pvedaemon.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pvefw-logger.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pveproxy.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pve-cluster.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service
|
||||
hostgroup: role-hypervisor
|
||||
- name: PVE Unit pvestatd.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service
|
||||
hostgroup: role-hypervisor
|
||||
# Tag-specific checks
|
||||
# ansible-pull
|
||||
- name: Unit ansible-pull.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.service
|
||||
hostgroup: tag-ansible-pull
|
||||
- name: Unit ansible-pull.timer
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.timer
|
||||
hostgroup: tag-ansible-pull
|
||||
# docker
|
||||
# Strictly speaking not a tag, but it's best to keep it separated
|
||||
# TODO: Figure out how I'm going to implement Docker checks
|
||||
# nagios-checkpgsql
|
||||
- name: PSQL
|
||||
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
|
||||
hostgroup: tag-nagios-checkpgsql
|
||||
- name: PSQL Connections
|
||||
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0"
|
||||
hostgroup: tag-nagios-checkpgsql
|
||||
- name: Unit postgresql.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service
|
||||
hostgroup: tag-nagios-checkpgsql
|
||||
# nagios-checkswap
|
||||
- name: Swap Usage
|
||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||
hostgroup: tag-nagios-checkswap
|
||||
# zerotier
|
||||
- name: Unit zerotier-one.service
|
||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service
|
||||
hostgroup: tag-zerotier
|
||||
register: config
|
||||
tags: [ nagios, template ]
|
||||
- name: assure config file is loaded
|
||||
lineinfile: path=/data/nagios/etc/nagios.cfg line='cfg_file=/opt/nagios/etc/objects/ansible.cfg'
|
||||
tags: [ nagios, template ]
|
||||
- name: docker deploy nagios
|
||||
docker_container:
|
||||
name: nagios
|
||||
image: jasonrivers/nagios
|
||||
env:
|
||||
NAGIOSADMIN_USER: admin
|
||||
NAGIOSADMIN_PASS: "{{ secret_nagios_admin_pass }}"
|
||||
NAGIOS_TIMEZONE: "America/Chicago"
|
||||
networks:
|
||||
- name: web
|
||||
aliases: [ "nagios" ]
|
||||
volumes:
|
||||
- /data/nagios/etc:/opt/nagios/etc
|
||||
- /data/nagios/var:/opt/nagios/var
|
||||
- /data/nagios/plugins:/opt/Custom-Nagios-Plugins
|
||||
- /data/nagios/nagiosgraph/var:/opt/nagiosgraph/var
|
||||
- /data/nagios/nagiosgraph/etc:/opt/nagiosgraph/etc
|
||||
- /dev/null:/opt/nagios/bin/nsca
|
||||
- /dev/null:/opt/nagios/bin/send_nsca
|
||||
tags: [ docker, nagios ]
|
||||
- name: restart nagios
|
||||
docker_container: name=nagios state=started restart=yes
|
||||
when: config and config is changed
|
||||
tags: [ docker, nagios ]
|
27
roles/nagios/defaults/main.yml
Normal file
27
roles/nagios/defaults/main.yml
Normal file
@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env ansible-playbook
# vim:ft=ansible:
#
# Default variables for the nagios role. Every value here is meant to be
# overridden by the play that applies the role.

# Host directory holding Nagios state. The role bind-mounts its etc/, var/,
# plugins/ and nagiosgraph/ subdirectories into the container.
nagios_data_dir: /data/nagios

# Password for the "admin" web user (handed to the container as
# NAGIOSADMIN_PASS). This is a placeholder only: always override it with a
# real secret, e.g. a vaulted variable.
nagios_admin_pass: foobar

# Handed to the container as NAGIOS_TIMEZONE.
nagios_timezone: "America/Chicago"

# Contacts to define. Each entry takes a name, the two notification commands
# and an optional list of extra key/value directives.
# nagios_contacts:
#   - name: Bob
#     host_notification_commands: notify-host-by-email
#     service_notification_commands: notify-service-by-email
#     extra:
#       - key: email
#         value: bob@mysite.example.com
nagios_contacts: []

# Check commands to define. Nagios macros are written $LIKE_THIS$, with a
# dollar sign on both sides.
# nagios_commands:
#   - name: check_thing
#     command: "$USER1$/check_thing -H $HOSTADDRESS$ $ARG1$"
nagios_commands: []

# Services to define. "hostgroup" is optional; a leading "!" must be quoted
# because a bare "!" starts a YAML tag.
# nagios_services:
#   - name: HTTP
#     command: check_http
#     hostgroup: tag-nagios-checkhttp
#   - name: SSH
#     command: check_ssh
#   - name: Docker
#     command: foo
#     hostgroup: "!tag-no-docker"
nagios_services: []
|
4
roles/nagios/handlers/main.yml
Normal file
4
roles/nagios/handlers/main.yml
Normal file
@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env ansible-playbook
# vim:ft=ansible:

# Notified by the config tasks of this role whenever the generated object
# config or the cfg_file line in nagios.cfg changes. Restarts the existing
# "nagios" container.
- name: restart nagios
  docker_container:
    name: nagios
    state: started
    restart: true
|
35
roles/nagios/tasks/main.yml
Normal file
35
roles/nagios/tasks/main.yml
Normal file
@ -0,0 +1,35 @@
|
||||
# vim:ft=ansible:
#
# Deploys Nagios as a Docker container and hooks the Ansible-generated object
# config (ansible.cfg) into nagios.cfg.

- name: assure data directory for nagios
  file:
    path: "{{ nagios_data_dir }}"
    state: directory
    mode: "0755"
  tags: [ nagios ]

# The container is started before any config is written: the tasks below edit
# files under etc/ that this role never creates itself.
# NOTE(review): presumably the image seeds an empty etc/ on first start --
# confirm against the image's entrypoint.
- name: docker deploy nagios
  docker_container:
    name: nagios
    image: jasonrivers/nagios
    pull: true
    restart_policy: unless-stopped
    state: started
    env:
      NAGIOSADMIN_USER: admin
      NAGIOSADMIN_PASS: "{{ nagios_admin_pass }}"
      NAGIOS_TIMEZONE: "{{ nagios_timezone }}"
    networks:
      - name: web
        aliases: [ "nagios" ]
    volumes:
      - "{{ nagios_data_dir }}/etc:/opt/nagios/etc"
      - "{{ nagios_data_dir }}/var:/opt/nagios/var"
      - "{{ nagios_data_dir }}/plugins:/opt/Custom-Nagios-Plugins"
      - "{{ nagios_data_dir }}/nagiosgraph/var:/opt/nagiosgraph/var"
      - "{{ nagios_data_dir }}/nagiosgraph/etc:/opt/nagiosgraph/etc"
      # /dev/null is mounted over both NSCA binaries.
      # NOTE(review): presumably to disable NSCA -- confirm.
      - /dev/null:/opt/nagios/bin/nsca
      - /dev/null:/opt/nagios/bin/send_nsca
  tags: [ docker, nagios ]

# Guard against a first-run race: the template and lineinfile tasks below
# fail if etc/objects/ or nagios.cfg do not exist yet. Returns immediately
# once the file is present.
- name: wait for nagios to populate its config directory
  wait_for:
    path: "{{ nagios_data_dir }}/etc/nagios.cfg"
    timeout: 60
  tags: [ nagios, template ]

- name: template out config for nagios
  template:
    src: nagios-ansible.cfg.j2
    dest: "{{ nagios_data_dir }}/etc/objects/ansible.cfg"
    owner: root
    group: root
    mode: "0644"
  tags: [ nagios, template ]
  notify: restart nagios

# The line uses the in-container path (/opt/nagios/etc), not the host path,
# because nagios.cfg is read from inside the container.
- name: assure config file is loaded
  lineinfile:
    path: "{{ nagios_data_dir }}/etc/nagios.cfg"
    line: 'cfg_file=/opt/nagios/etc/objects/ansible.cfg'
  tags: [ nagios, template ]
  notify: restart nagios
|
Loading…
Reference in New Issue
Block a user