Compare commits
4 Commits
320427cba4
...
ca9882adde
Author | SHA1 | Date | |
---|---|---|---|
ca9882adde | |||
e63898f328 | |||
12f187e1e2 | |||
9cad3d4867 |
@ -166,6 +166,149 @@ secret_nagios_matrix_token: !vault |
|
|||||||
6433376138386531380a383762393137613738643538343438633730313135613730613139393536
|
6433376138386531380a383762393137613738643538343438633730313135613730613139393536
|
||||||
35666133666262383862663637623738643836383633653864626231623034613662646563623936
|
35666133666262383862663637623738643836383633653864626231623034613662646563623936
|
||||||
3763356331333561383833386162616664376335333139376363
|
3763356331333561383833386162616664376335333139376363
|
||||||
|
nagios_contacts:
|
||||||
|
- name: matrix
|
||||||
|
host_notification_commands: notify-host-by-matrix
|
||||||
|
service_notification_commands: notify-service-by-matrix
|
||||||
|
host_notification_period: ansible-not-late-at-night
|
||||||
|
service_notification_period: ansible-not-late-at-night
|
||||||
|
extra:
|
||||||
|
- key: contactgroups
|
||||||
|
value: ansible
|
||||||
|
- name: salt
|
||||||
|
host_notification_commands: notify-host-by-email
|
||||||
|
service_notification_commands: notify-service-by-email
|
||||||
|
extra:
|
||||||
|
- key: email
|
||||||
|
value: alerts@babor.tech
|
||||||
|
nagios_commands:
|
||||||
|
# This command is included in the container image
|
||||||
|
- name: check_nrpe
|
||||||
|
command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
|
||||||
|
- name: check_by_ssh
|
||||||
|
command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
|
||||||
|
- name: notify-host-by-matrix
|
||||||
|
command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\n$HOSTNAME$ is $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
|
||||||
|
- name: notify-service-by-matrix
|
||||||
|
command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\nService $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
|
||||||
|
nagios_services:
|
||||||
|
# Agentless checks
|
||||||
|
- name: HTTP
|
||||||
|
command: check_http
|
||||||
|
hostgroup: tag-nagios-checkhttp
|
||||||
|
- name: HTTPS
|
||||||
|
command: check_http!--ssl
|
||||||
|
hostgroup: tag-nagios-checkhttp
|
||||||
|
- name: SSH
|
||||||
|
command: check_ssh
|
||||||
|
# check_by_ssh checks
|
||||||
|
- name: CPU Utilization
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90
|
||||||
|
- name: DNS Resolution
|
||||||
|
command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv
|
||||||
|
- name: Executables in tmp
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_executables_in_tmp
|
||||||
|
- name: Last Ansible Play
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 432000 -c 604800
|
||||||
|
- name: Memory Usage
|
||||||
|
command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5%
|
||||||
|
hostgroup: "ansible,!tag-prov-zfs"
|
||||||
|
- name: Ping Self over DNS
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname
|
||||||
|
- name: Reboot Required
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
|
||||||
|
- name: Unit atd.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service
|
||||||
|
- name: Unit backup.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
|
||||||
|
hostgroup: "ansible,!role-hypervisor"
|
||||||
|
- name: Unit backup.timer
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
|
||||||
|
hostgroup: "ansible,!role-hypervisor"
|
||||||
|
- name: Unit cron.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service
|
||||||
|
- name: Unit dbus.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service
|
||||||
|
- name: Unit ssh.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service
|
||||||
|
- name: Unit systemd-resolved.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service
|
||||||
|
hostgroup: "ansible,!role-hypervisor"
|
||||||
|
- name: Users
|
||||||
|
command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
|
||||||
|
# Privileged checks
|
||||||
|
# Required because check_disk may attempt to get the free space of
|
||||||
|
# restricted mountpoints
|
||||||
|
- name: Disk Usage
|
||||||
|
command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I '^/tmp/.mount_' -I '^/dev/loop'
|
||||||
|
# Device type checks
|
||||||
|
# R720
|
||||||
|
- name: CPU0 Temperature
|
||||||
|
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000
|
||||||
|
hostgroup: device-type-r720
|
||||||
|
- name: CPU1 Temperature
|
||||||
|
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001
|
||||||
|
hostgroup: device-type-r720
|
||||||
|
# Pi 4 (2G, 4G, 4G-storage)
|
||||||
|
- name: CPU Temperature
|
||||||
|
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0
|
||||||
|
hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage
|
||||||
|
# Device role checks
|
||||||
|
# hypervisor (which is assumed to be Proxmox)
|
||||||
|
- name: PVE Unit pve-firewall.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service
|
||||||
|
hostgroup: role-hypervisor
|
||||||
|
- name: PVE Unit spiceproxy.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service
|
||||||
|
hostgroup: role-hypervisor
|
||||||
|
- name: PVE Unit pve-ha-crm.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service
|
||||||
|
hostgroup: role-hypervisor
|
||||||
|
- name: PVE Unit pvedaemon.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service
|
||||||
|
hostgroup: role-hypervisor
|
||||||
|
- name: PVE Unit pvefw-logger.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service
|
||||||
|
hostgroup: role-hypervisor
|
||||||
|
- name: PVE Unit pveproxy.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service
|
||||||
|
hostgroup: role-hypervisor
|
||||||
|
- name: PVE Unit pve-cluster.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service
|
||||||
|
hostgroup: role-hypervisor
|
||||||
|
- name: PVE Unit pvestatd.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service
|
||||||
|
hostgroup: role-hypervisor
|
||||||
|
# Tag-specific checks
|
||||||
|
# docker
|
||||||
|
- name: Unit docker.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
|
||||||
|
hostgroup: "ansible,!tag-no-docker"
|
||||||
|
- name: Docker Status
|
||||||
|
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running
|
||||||
|
hostgroup: tag-nagios-checkdocker
|
||||||
|
# nagios-checkpgsql
|
||||||
|
- name: PSQL
|
||||||
|
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
|
||||||
|
hostgroup: tag-nagios-checkpgsql
|
||||||
|
- name: PSQL Connections
|
||||||
|
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0"
|
||||||
|
hostgroup: tag-nagios-checkpgsql
|
||||||
|
# https://rhaas.blogspot.com/2020/02/useless-vacuuming.html
|
||||||
|
- name: PSQL Old Xacts
|
||||||
|
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000"
|
||||||
|
hostgroup: tag-nagios-checkpgsql
|
||||||
|
- name: Unit postgresql.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service
|
||||||
|
hostgroup: tag-nagios-checkpgsql
|
||||||
|
# nagios-checkswap
|
||||||
|
- name: Swap Usage
|
||||||
|
command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
||||||
|
hostgroup: tag-nagios-checkswap
|
||||||
|
# zerotier
|
||||||
|
- name: Unit zerotier-one.service
|
||||||
|
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service
|
||||||
|
hostgroup: tag-zt-personal
|
||||||
|
|
||||||
# For Netbox
|
# For Netbox
|
||||||
secret_netbox_user_pass: !vault |
|
secret_netbox_user_pass: !vault |
|
||||||
|
@ -94,6 +94,8 @@
|
|||||||
# Public
|
# Public
|
||||||
- record: git.desu.ltd
|
- record: git.desu.ltd
|
||||||
value: vm-general-1.ashburn.mgmt.desu.ltd
|
value: vm-general-1.ashburn.mgmt.desu.ltd
|
||||||
|
- record: grafana.desu.ltd
|
||||||
|
value: vm-general-1.ashburn.mgmt.desu.ltd
|
||||||
- record: matrix.desu.ltd
|
- record: matrix.desu.ltd
|
||||||
value: vm-general-1.ashburn.mgmt.desu.ltd
|
value: vm-general-1.ashburn.mgmt.desu.ltd
|
||||||
- record: movie.desu.ltd
|
- record: movie.desu.ltd
|
||||||
@ -104,6 +106,8 @@
|
|||||||
value: vm-general-1.ashburn.mgmt.desu.ltd
|
value: vm-general-1.ashburn.mgmt.desu.ltd
|
||||||
- record: netbox.desu.ltd
|
- record: netbox.desu.ltd
|
||||||
value: vm-general-1.ashburn.mgmt.desu.ltd
|
value: vm-general-1.ashburn.mgmt.desu.ltd
|
||||||
|
- record: prometheus.desu.ltd
|
||||||
|
value: vm-general-1.ashburn.mgmt.desu.ltd
|
||||||
# Public media stuff
|
# Public media stuff
|
||||||
- record: prowlarr.media.desu.ltd
|
- record: prowlarr.media.desu.ltd
|
||||||
value: vm-general-1.ashburn.mgmt.desu.ltd
|
value: vm-general-1.ashburn.mgmt.desu.ltd
|
||||||
|
@ -16,20 +16,25 @@
|
|||||||
- name: include tasks for applications
|
- name: include tasks for applications
|
||||||
include_tasks: tasks/{{ item }}
|
include_tasks: tasks/{{ item }}
|
||||||
with_items:
|
with_items:
|
||||||
|
# Applications
|
||||||
- app/gitlab-runner.yml
|
- app/gitlab-runner.yml
|
||||||
- app/redis.yml
|
- app/redis.yml
|
||||||
|
# Frontend web services
|
||||||
- web/9iron.yml
|
- web/9iron.yml
|
||||||
- web/desultd.yml
|
- web/desultd.yml
|
||||||
- web/element-web.yml
|
- web/element-web.yml
|
||||||
- web/gitea.yml
|
- web/gitea.yml
|
||||||
|
- web/grafana.yml
|
||||||
- web/netbox.yml
|
- web/netbox.yml
|
||||||
- web/nextcloud.yml
|
- web/nextcloud.yml
|
||||||
|
- web/synapse.yml
|
||||||
|
# Backend web services
|
||||||
- web/prowlarr.yml
|
- web/prowlarr.yml
|
||||||
- web/radarr.yml
|
- web/radarr.yml
|
||||||
- web/sonarr.yml
|
- web/sonarr.yml
|
||||||
- web/srv.yml
|
- web/srv.yml
|
||||||
- web/synapse.yml
|
|
||||||
- web/transmission.yml
|
- web/transmission.yml
|
||||||
|
# Games
|
||||||
- game/factorio.yml
|
- game/factorio.yml
|
||||||
- game/minecraft-createfarming.yml
|
- game/minecraft-createfarming.yml
|
||||||
- game/minecraft-direwolf20.yml
|
- game/minecraft-direwolf20.yml
|
||||||
@ -53,159 +58,26 @@
|
|||||||
- repo: https://git.desu.ltd/salt/gitea-custom
|
- repo: https://git.desu.ltd/salt/gitea-custom
|
||||||
dest: /data/gitea/data/gitea/custom
|
dest: /data/gitea/data/gitea/custom
|
||||||
tags: [ web, git ]
|
tags: [ web, git ]
|
||||||
|
- role: prometheus
|
||||||
|
tags: [ prometheus, monitoring ]
|
||||||
- role: nagios
|
- role: nagios
|
||||||
vars:
|
vars:
|
||||||
|
# Definitions for contacts and checks are defined in inventory vars
|
||||||
|
# See group_vars/all.yml if you need to change those
|
||||||
nagios_matrix_server: "https://matrix.desu.ltd"
|
nagios_matrix_server: "https://matrix.desu.ltd"
|
||||||
nagios_matrix_room: "!NWNCKlNmOTcarMcMIh:desu.ltd"
|
nagios_matrix_room: "!NWNCKlNmOTcarMcMIh:desu.ltd"
|
||||||
nagios_matrix_token: "{{ secret_nagios_matrix_token }}"
|
nagios_matrix_token: "{{ secret_nagios_matrix_token }}"
|
||||||
nagios_data_dir: /data/nagios
|
nagios_data_dir: /data/nagios
|
||||||
nagios_admin_pass: "{{ secret_nagios_admin_pass }}"
|
nagios_admin_pass: "{{ secret_nagios_admin_pass }}"
|
||||||
nagios_contacts:
|
|
||||||
- name: matrix
|
|
||||||
host_notification_commands: notify-host-by-matrix
|
|
||||||
service_notification_commands: notify-service-by-matrix
|
|
||||||
host_notification_period: ansible-not-late-at-night
|
|
||||||
service_notification_period: ansible-not-late-at-night
|
|
||||||
extra:
|
|
||||||
- key: contactgroups
|
|
||||||
value: ansible
|
|
||||||
- name: salt
|
|
||||||
host_notification_commands: notify-host-by-email
|
|
||||||
service_notification_commands: notify-service-by-email
|
|
||||||
extra:
|
|
||||||
- key: email
|
|
||||||
value: alerts@babor.tech
|
|
||||||
nagios_commands:
|
|
||||||
# This command is included in the container image
|
|
||||||
- name: check_nrpe
|
|
||||||
command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
|
|
||||||
- name: check_by_ssh
|
|
||||||
command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
|
|
||||||
- name: notify-host-by-matrix
|
|
||||||
command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\n$HOSTNAME$ is $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
|
|
||||||
- name: notify-service-by-matrix
|
|
||||||
command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\nService $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
|
|
||||||
nagios_services:
|
|
||||||
# Agentless checks
|
|
||||||
- name: HTTP
|
|
||||||
command: check_http
|
|
||||||
hostgroup: tag-nagios-checkhttp
|
|
||||||
- name: HTTPS
|
|
||||||
command: check_http!--ssl
|
|
||||||
hostgroup: tag-nagios-checkhttp
|
|
||||||
- name: SSH
|
|
||||||
command: check_ssh
|
|
||||||
# check_by_ssh checks
|
|
||||||
- name: CPU Utilization
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90
|
|
||||||
- name: DNS Resolution
|
|
||||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv
|
|
||||||
- name: Executables in tmp
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_executables_in_tmp
|
|
||||||
- name: Last Ansible Play
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 432000 -c 604800
|
|
||||||
- name: Memory Usage
|
|
||||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5%
|
|
||||||
hostgroup: "ansible,!tag-prov-zfs"
|
|
||||||
- name: Ping Self over DNS
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname
|
|
||||||
- name: Reboot Required
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
|
|
||||||
- name: Unit atd.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service
|
|
||||||
- name: Unit backup.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
|
|
||||||
hostgroup: "ansible,!role-hypervisor"
|
|
||||||
- name: Unit backup.timer
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
|
|
||||||
hostgroup: "ansible,!role-hypervisor"
|
|
||||||
- name: Unit cron.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service
|
|
||||||
- name: Unit dbus.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service
|
|
||||||
- name: Unit ssh.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service
|
|
||||||
- name: Unit systemd-resolved.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service
|
|
||||||
hostgroup: "ansible,!role-hypervisor"
|
|
||||||
- name: Users
|
|
||||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
|
|
||||||
# Privileged checks
|
|
||||||
# Required because check_disk may attempt to get the free space of
|
|
||||||
# restricted mountpoints
|
|
||||||
- name: Disk Usage
|
|
||||||
command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I '^/tmp/.mount_' -I '^/dev/loop'
|
|
||||||
# Device type checks
|
|
||||||
# R720
|
|
||||||
- name: CPU0 Temperature
|
|
||||||
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000
|
|
||||||
hostgroup: device-type-r720
|
|
||||||
- name: CPU1 Temperature
|
|
||||||
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001
|
|
||||||
hostgroup: device-type-r720
|
|
||||||
# Pi 4 4G
|
|
||||||
- name: CPU Temperature
|
|
||||||
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0
|
|
||||||
hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage
|
|
||||||
# Device role checks
|
|
||||||
# hypervisor (which is assumed to be Proxmox)
|
|
||||||
- name: PVE Unit pve-firewall.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service
|
|
||||||
hostgroup: role-hypervisor
|
|
||||||
- name: PVE Unit spiceproxy.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service
|
|
||||||
hostgroup: role-hypervisor
|
|
||||||
- name: PVE Unit pve-ha-crm.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service
|
|
||||||
hostgroup: role-hypervisor
|
|
||||||
- name: PVE Unit pvedaemon.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service
|
|
||||||
hostgroup: role-hypervisor
|
|
||||||
- name: PVE Unit pvefw-logger.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service
|
|
||||||
hostgroup: role-hypervisor
|
|
||||||
- name: PVE Unit pveproxy.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service
|
|
||||||
hostgroup: role-hypervisor
|
|
||||||
- name: PVE Unit pve-cluster.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service
|
|
||||||
hostgroup: role-hypervisor
|
|
||||||
- name: PVE Unit pvestatd.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service
|
|
||||||
hostgroup: role-hypervisor
|
|
||||||
# Tag-specific checks
|
|
||||||
# docker
|
|
||||||
- name: Unit docker.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
|
|
||||||
hostgroup: "ansible,!tag-no-docker"
|
|
||||||
- name: Docker Status
|
|
||||||
command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running
|
|
||||||
hostgroup: tag-nagios-checkdocker
|
|
||||||
# nagios-checkpgsql
|
|
||||||
- name: PSQL
|
|
||||||
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
|
|
||||||
hostgroup: tag-nagios-checkpgsql
|
|
||||||
- name: PSQL Connections
|
|
||||||
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0"
|
|
||||||
hostgroup: tag-nagios-checkpgsql
|
|
||||||
# https://rhaas.blogspot.com/2020/02/useless-vacuuming.html
|
|
||||||
- name: PSQL Old Xacts
|
|
||||||
command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000"
|
|
||||||
hostgroup: tag-nagios-checkpgsql
|
|
||||||
- name: Unit postgresql.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service
|
|
||||||
hostgroup: tag-nagios-checkpgsql
|
|
||||||
# nagios-checkswap
|
|
||||||
- name: Swap Usage
|
|
||||||
command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
|
|
||||||
hostgroup: tag-nagios-checkswap
|
|
||||||
# zerotier
|
|
||||||
- name: Unit zerotier-one.service
|
|
||||||
command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service
|
|
||||||
hostgroup: tag-zt-personal
|
|
||||||
tags: [ nagios, no-auto ]
|
tags: [ nagios, no-auto ]
|
||||||
- role: ingress
|
- role: ingress
|
||||||
vars:
|
vars:
|
||||||
|
ingress_head: |
|
||||||
|
# Used by Grafana: sets Connection from the Upgrade header for its /api/live/ WebSocket endpoint
|
||||||
|
map $http_upgrade $connection_upgrade {
|
||||||
|
default upgrade;
|
||||||
|
'' close;
|
||||||
|
}
|
||||||
ingress_servers:
|
ingress_servers:
|
||||||
# desu.ltd
|
# desu.ltd
|
||||||
- name: desu.ltd
|
- name: desu.ltd
|
||||||
@ -221,6 +93,16 @@
|
|||||||
return 200 '{"m.homeserver":{"base_url":"https://matrix.desu.ltd"}}';
|
return 200 '{"m.homeserver":{"base_url":"https://matrix.desu.ltd"}}';
|
||||||
- name: git.desu.ltd
|
- name: git.desu.ltd
|
||||||
proxy_pass: http://gitea:3000
|
proxy_pass: http://gitea:3000
|
||||||
|
- name: grafana.desu.ltd
|
||||||
|
proxy_pass: http://grafana:3000
|
||||||
|
locations:
|
||||||
|
- location: "/api/live/"
|
||||||
|
contents: |
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Upgrade $http_upgrade;
|
||||||
|
proxy_set_header Connection $connection_upgrade;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_pass http://grafana:3000;
|
||||||
- name: matrix.desu.ltd
|
- name: matrix.desu.ltd
|
||||||
proxies:
|
proxies:
|
||||||
- location: "~* ^(\/_matrix|\/_synapse|\/client|\/health)"
|
- location: "~* ^(\/_matrix|\/_synapse|\/client|\/health)"
|
||||||
@ -245,6 +127,16 @@
|
|||||||
try_files $uri $uri/ =404;
|
try_files $uri $uri/ =404;
|
||||||
- name: netbox.desu.ltd
|
- name: netbox.desu.ltd
|
||||||
proxy_pass: http://netbox:8080
|
proxy_pass: http://netbox:8080
|
||||||
|
- name: prometheus.desu.ltd
|
||||||
|
directives:
|
||||||
|
- "allow {{ common_home_address }}/{{ common_home_address_mask }}"
|
||||||
|
- "allow 10.0.0.0/8"
|
||||||
|
- "allow 172.16.0.0/12"
|
||||||
|
- "allow 192.168.0.0/16"
|
||||||
|
# TODO: Replace this with a dynamically generated list of public IPs from the inventory
|
||||||
|
- "allow 45.79.58.44/32" # bastion1.dallas.mgmt.desu.ltd
|
||||||
|
- "deny all"
|
||||||
|
proxy_pass: http://prometheus:9090
|
||||||
# desu.ltd media bullshit
|
# desu.ltd media bullshit
|
||||||
- name: prowlarr.media.desu.ltd
|
- name: prowlarr.media.desu.ltd
|
||||||
directives:
|
directives:
|
||||||
|
@ -35,6 +35,32 @@
|
|||||||
- /usr/local/bin/monitoring-scripts/check_docker
|
- /usr/local/bin/monitoring-scripts/check_docker
|
||||||
- /usr/local/bin/monitoring-scripts/check_temp
|
- /usr/local/bin/monitoring-scripts/check_temp
|
||||||
tags: [ nagios, sudo ]
|
tags: [ nagios, sudo ]
|
||||||
|
- name: assure prometheus node exporter
|
||||||
|
# https://github.com/prometheus/node_exporter
|
||||||
|
ansible.builtin.docker_container:
|
||||||
|
name: prometheus-node-exporter
|
||||||
|
image: quay.io/prometheus/node-exporter:latest
|
||||||
|
command:
|
||||||
|
- '--path.rootfs=/host'
|
||||||
|
network_mode: host
|
||||||
|
pid_mode: host
|
||||||
|
volumes:
|
||||||
|
- /:/host:ro,rslave
|
||||||
|
tags: [ prometheus ]
|
||||||
|
- name: assure prometheus cadvisor exporter
|
||||||
|
ansible.builtin.docker_container:
|
||||||
|
name: prometheus-cadvisor-exporter
|
||||||
|
image: gcr.io/cadvisor/cadvisor:latest
|
||||||
|
ports:
|
||||||
|
- 9101:8080/tcp
|
||||||
|
volumes:
|
||||||
|
- /:/rootfs:ro
|
||||||
|
- /var/run:/var/run:ro
|
||||||
|
- /sys:/sys:ro
|
||||||
|
- /var/lib/docker:/var/lib/docker:ro
|
||||||
|
- /dev/disk:/dev/disk:ro
|
||||||
|
devices:
|
||||||
|
- /dev/kmsg
|
||||||
- hosts: all
|
- hosts: all
|
||||||
gather_facts: no
|
gather_facts: no
|
||||||
tasks:
|
tasks:
|
||||||
|
30
playbooks/tasks/web/grafana.yml
Normal file
30
playbooks/tasks/web/grafana.yml
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
# vim:ft=ansible:
|
||||||
|
- name: ensure grafana dirs
|
||||||
|
ansible.builtin.file:
|
||||||
|
state: directory
|
||||||
|
owner: 472
|
||||||
|
group: 472
|
||||||
|
mode: "0750"
|
||||||
|
path: "{{ item }}"
|
||||||
|
with_items:
|
||||||
|
- /data/grafana/storage
|
||||||
|
- /data/grafana/logs
|
||||||
|
tags: [ docker, grafana, monitoring ]
|
||||||
|
- name: docker deploy grafana
|
||||||
|
docker_container:
|
||||||
|
name: grafana
|
||||||
|
image: grafana/grafana-oss:main
|
||||||
|
env:
|
||||||
|
TZ: "America/Chicago"
|
||||||
|
# Log to the console (STDOUT) as well as to file so log aggregators can hook it more easily
|
||||||
|
GF_LOG_MODE: "console file"
|
||||||
|
GF_SERVER_DOMAIN: "grafana.desu.ltd"
|
||||||
|
GF_SERVER_PROTOCOL: "http"
|
||||||
|
GF_SERVER_ROOT_URL: "https://grafana.desu.ltd"
|
||||||
|
networks:
|
||||||
|
- name: web
|
||||||
|
aliases: [ "grafana" ]
|
||||||
|
volumes:
|
||||||
|
- /data/grafana/storage:/var/lib/grafana
|
||||||
|
- /data/grafana/logs:/var/log/grafana
|
||||||
|
tags: [ docker, grafana, monitoring ]
|
@ -1,3 +1,7 @@
|
|||||||
|
{% if ingress_head is defined %}
|
||||||
|
{{ ingress_head }}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
{% for server in ingress_servers %}
|
{% for server in ingress_servers %}
|
||||||
server {
|
server {
|
||||||
{% if loop.index == 1 %}
|
{% if loop.index == 1 %}
|
||||||
|
5
roles/prometheus/handlers/main.yml
Normal file
5
roles/prometheus/handlers/main.yml
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
#!/usr/bin/env ansible-playbook
|
||||||
|
# vim:ft=ansible:
|
||||||
|
- name: restart prometheus container
|
||||||
|
docker_container: name="prometheus" state=started restart=yes
|
||||||
|
become: yes
|
33
roles/prometheus/tasks/main.yml
Normal file
33
roles/prometheus/tasks/main.yml
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# vim:ft=ansible:
|
||||||
|
- name: ensure prometheus dirs
|
||||||
|
ansible.builtin.file:
|
||||||
|
state: directory
|
||||||
|
owner: 5476
|
||||||
|
group: 5476
|
||||||
|
mode: "0750"
|
||||||
|
path: "{{ item }}"
|
||||||
|
with_items:
|
||||||
|
- /data/prometheus/config
|
||||||
|
- /data/prometheus/data
|
||||||
|
notify: restart prometheus container
|
||||||
|
- name: template out configuration file
|
||||||
|
ansible.builtin.template:
|
||||||
|
src: prometheus.yml.j2
|
||||||
|
owner: 5476
|
||||||
|
group: 5476
|
||||||
|
mode: "0640"
|
||||||
|
dest: /data/prometheus/config/prometheus.yml
|
||||||
|
notify: restart prometheus container
|
||||||
|
- name: docker deploy prometheus
|
||||||
|
community.docker.docker_container:
|
||||||
|
name: prometheus
|
||||||
|
image: prom/prometheus:latest
|
||||||
|
user: 5476:5476
|
||||||
|
env:
|
||||||
|
TZ: "America/Chicago"
|
||||||
|
networks:
|
||||||
|
- name: web
|
||||||
|
aliases: [ "prometheus" ]
|
||||||
|
volumes:
|
||||||
|
- /data/prometheus/config:/etc/prometheus
|
||||||
|
- /data/prometheus/data:/prometheus
|
26
roles/prometheus/templates/prometheus.yml.j2
Normal file
26
roles/prometheus/templates/prometheus.yml.j2
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# my global config
|
||||||
|
---
|
||||||
|
global:
|
||||||
|
scrape_interval: 15s
|
||||||
|
evaluation_interval: 15s
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
# The job name is added as a label `job=<job_name>` to any timeseries
|
||||||
|
# scraped from this config.
|
||||||
|
- job_name: "prometheus"
|
||||||
|
# metrics_path defaults to '/metrics'
|
||||||
|
# scheme defaults to 'http'.
|
||||||
|
static_configs:
|
||||||
|
- targets: ["localhost:9090"]
|
||||||
|
- job_name: "node-exporter"
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
{% for host in groups['tags_nagios'] %}
|
||||||
|
- '{{ host }}:9100'
|
||||||
|
{% endfor %}
|
||||||
|
- job_name: "cadvisor-exporter"
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
{% for host in groups['tags_nagios'] %}
|
||||||
|
- '{{ host }}:9101'
|
||||||
|
{% endfor %}
|
Loading…
Reference in New Issue
Block a user