#!/usr/bin/env ansible-playbook
# vim:ft=ansible:
# Webservers
#
# Three plays, one per web host. Each play:
#   * sets docker_container module defaults (started, restart unless-stopped,
#     always pull),
#   * ensures the "web" docker network and includes per-application task files,
#   * applies the backup role and an ingress role describing the virtual hosts
#     served from that machine.
# Variables named secret_* are not defined in this file.
---
# web1: redis, 9iron, desu.ltd, gitea, nextcloud, srv
- hosts: web1.dallas.mgmt.desu.ltd
  gather_facts: false
  module_defaults:
    docker_container:
      state: started
      restart_policy: unless-stopped
      pull: true
  # NOTE(review): Ansible runs `roles` before `tasks`, so the roles below run
  # before the docker network and applications are set up. The web2 play uses
  # `pre_tasks` for the same steps -- confirm which ordering is intended.
  tasks:
    - name: ensure docker network
      docker_network:
        name: web
      tags: [ docker ]
    - name: include tasks for applications
      include_tasks: tasks/{{ item }}
      with_items:
        - app/redis.yml
        - web/9iron.yml
        - web/desultd.yml
        - web/gitea.yml
        - web/nextcloud.yml
        - web/srv.yml
      tags: [ always ]
  roles:
    - role: backup
      vars:
        backup_s3backup_list_extra:
          - /app/gitea/gitea
          - /data
          - /srv/desu.ltd
        backup_s3backup_exclude_list_extra:
          - /var/lib/gitea/log
          - /data/gitea/data/gitea/log
      tags: [ backup ]
    - role: git
      vars:
        git_repos:
          - repo: https://git.desu.ltd/salt/gitea-custom
            dest: /data/gitea/data/gitea/custom
      tags: [ web, git ]
    - role: ingress
      vars:
        ingress_servers:
          # desu.ltd
          - name: desu.ltd
            proxy_pass: http://desultd:80
            locations:
              # Static JSON answers for Matrix .well-known discovery
              - location: /.well-known/matrix/server
                contents: |
                  default_type application/json;
                  return 200 '{"m.server":"matrix.desu.ltd:443"}';
              - location: /.well-known/matrix/client
                contents: |
                  default_type application/json;
                  return 200 '{"m.homeserver":{"base_url":"https://matrix.desu.ltd"}}';
          - name: git.desu.ltd
            proxy_pass: http://gitea:3000
          - name: nc.desu.ltd
            directives:
              - "add_header Strict-Transport-Security \"max-age=31536000\""
              - "client_max_body_size 0"
            proxy_pass: http://nextcloud:80
            locations:
              - location: "^~ /.well-known"
                contents: |
                  location = /.well-known/carddav { return 301 /remote.php/dav/; }
                  location = /.well-known/caldav { return 301 /remote.php/dav/; }
                  location ^~ /.well-known { return 301 /index.php$uri; }
                  try_files $uri $uri/ =404;
          # 9iron
          - name: www.9iron.club
            directives:
              - "return 301 $scheme://9iron.club$request_uri"
          - name: 9iron.club
            proxy_pass: http://9iron:80
          - name: srv.9iron.club
            proxy_pass: http://srv:80
      tags: [ web, docker, ingress ]

# web2: redis, jenkins, libreddit, pleroma
- hosts: web2.dallas.mgmt.desu.ltd
  gather_facts: false
  module_defaults:
    docker_container:
      state: started
      restart_policy: unless-stopped
      pull: true
  # pre_tasks run before the roles listed below
  pre_tasks:
    - name: ensure docker network
      docker_network:
        name: web
      tags: [ docker ]
    - name: include tasks for applications
      include_tasks: tasks/{{ item }}
      with_items:
        - app/redis.yml
        - web/jenkins.yml
        - web/libreddit.yml
        - web/pleroma.yml
      tags: [ always ]
  roles:
    - role: backup
      vars:
        backup_s3backup_list_extra:
          - /data
      tags: [ backup ]
    - role: ingress
      vars:
        ingress_servers:
          - name: cowfee.moe
            proxy_pass: http://pleroma:4000
          - name: lr.cowfee.moe
            directives:
              - "access_log /dev/null"
              - "error_log /dev/null"
            proxy_pass: http://libreddit:8080
          - name: jenkins.desu.ltd
            locations:
              - location: "/"
                contents: |
                  proxy_set_header Host $host;
                  proxy_set_header X-Real-IP $remote_addr;
                  proxy_set_header X-Forwarded-Proto https;
                  proxy_set_header X-Forwarded-Port 443;
                  proxy_pass http://jenkins:8080;
              # Literal block scalar: this location value ends with a newline
              - location: |
                  ~ "^/static/[0-9a-fA-F]{8}\/(.*)$"
                contents: |
                  rewrite "^/static/[0-9a-fA-F]{8}\/(.*)" /$1 last;
              - location: "/userContent"
                contents: |
                  root /data/jenkins/home/;
                  if (!-f $request_filename) {
                    rewrite (.*) /$1 last;
                    break;
                  }
      tags: [ web, docker, ingress ]

# web3: redis, movienight, netbox, plus the nagios role
- hosts: web3.dallas.mgmt.desu.ltd
  gather_facts: false
  module_defaults:
    docker_container:
      state: started
      restart_policy: unless-stopped
      pull: true
  # NOTE(review): Ansible runs `roles` before `tasks`, so the roles below run
  # before the docker network and applications are set up. The web2 play uses
  # `pre_tasks` for the same steps -- confirm which ordering is intended.
  tasks:
    - name: ensure docker network
      docker_network:
        name: web
      tags: [ docker ]
    - name: include tasks for applications
      include_tasks: tasks/{{ item }}
      with_items:
        - app/redis.yml
        - web/movienight.yml
        - web/netbox.yml
      tags: [ always ]
  roles:
    - role: backup
      vars:
        backup_s3backup_list_extra:
          - /data
      tags: [ backup ]
    # TODO: Replace this with Naemon(?)
    - role: nagios
      vars:
        nagios_matrix_server: "https://matrix.desu.ltd"
        nagios_matrix_room: "!NWNCKlNmOTcarMcMIh:desu.ltd"
        nagios_matrix_token: "{{ secret_nagios_matrix_token }}"
        nagios_data_dir: /data/nagios
        nagios_admin_pass: "{{ secret_nagios_admin_pass }}"
        nagios_contacts:
          - name: matrix
            host_notification_commands: notify-host-by-matrix
            service_notification_commands: notify-service-by-matrix
            host_notification_period: ansible-not-late-at-night
            service_notification_period: ansible-not-late-at-night
            extra:
              - key: contactgroups
                value: ansible
          - name: salt
            host_notification_commands: notify-host-by-email
            service_notification_commands: notify-service-by-email
            extra:
              - key: email
                value: alerts@babor.tech
        nagios_commands:
          # This command is included in the container image
          - name: check_nrpe
            command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
          # Runs $ARG1$ on the target host over SSH as nagios-checker
          - name: check_by_ssh
            command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
          - name: notify-host-by-matrix
            command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$ - $HOSTNAME$ is $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
          - name: notify-service-by-matrix
            command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$ - Service $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
        nagios_services:
          # Agentless checks
          - name: HTTP
            command: check_http
            hostgroup: tag-nagios-checkhttp
          - name: HTTPS
            command: check_http!--ssl
            hostgroup: tag-nagios-checkhttp
          - name: SSH
            command: check_ssh
          # check_by_ssh checks
          - name: CPU Utilization
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90
          - name: DNS Resolution
            command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv
          - name: Last Ansible Play
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 176400 -c 216000
          - name: Memory Usage
            command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5%
            hostgroup: "ansible,!tag-prov-zfs"
          - name: Ping Self over DNS
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname
          - name: Reboot Required
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
          - name: Unit atd.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service
          - name: Unit backup.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
            hostgroup: "ansible,!role-hypervisor"
          - name: Unit backup.timer
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
            hostgroup: "ansible,!role-hypervisor"
          - name: Unit cron.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service
          - name: Unit dbus.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service
          - name: Unit ssh.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service
          - name: Unit systemd-resolved.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service
            hostgroup: "ansible,!role-hypervisor"
          - name: Users
            command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
          # Privileged checks
          # Required because check_disk may attempt to get the free space of
          # restricted mountpoints
          - name: Disk Usage
            command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I'^/tmp/.mount_'
          # Device type checks
          # R720
          - name: CPU0 Temperature
            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000
            hostgroup: device-type-r720
          - name: CPU1 Temperature
            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001
            hostgroup: device-type-r720
          # Pi 4 (2G, 4G and 4G-storage device types)
          - name: CPU Temperature
            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0
            hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage
          # Device role checks
          # hypervisor (which is assumed to be Proxmox)
          - name: PVE Unit pve-firewall.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service
            hostgroup: role-hypervisor
          - name: PVE Unit spiceproxy.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service
            hostgroup: role-hypervisor
          - name: PVE Unit pve-ha-crm.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service
            hostgroup: role-hypervisor
          - name: PVE Unit pvedaemon.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service
            hostgroup: role-hypervisor
          - name: PVE Unit pvefw-logger.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service
            hostgroup: role-hypervisor
          - name: PVE Unit pveproxy.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service
            hostgroup: role-hypervisor
          - name: PVE Unit pve-cluster.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service
            hostgroup: role-hypervisor
          - name: PVE Unit pvestatd.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service
            hostgroup: role-hypervisor
          # Tag-specific checks
          # ansible-pull
          - name: Unit ansible-pull.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.service
            hostgroup: tag-ansible-pull
          - name: Unit ansible-pull.timer
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ansible-pull.timer
            hostgroup: tag-ansible-pull
          # docker
          - name: Unit docker.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
            hostgroup: "ansible,!tag-no-docker"
          - name: Docker Status
            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running
            hostgroup: tag-nagios-checkdocker
          - name: Docker CPU Usage
            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --cpu 85:90
            hostgroup: tag-nagios-checkdocker
          - name: Docker Memory Usage
            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --memory 90:95:%
            hostgroup: tag-nagios-checkdocker
          # nagios-checkpgsql
          # NOTE(review): the monitoring password is interpolated directly into
          # the check command line -- confirm this exposure is acceptable.
          - name: PSQL
            command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
            hostgroup: tag-nagios-checkpgsql
          - name: PSQL Connections
            command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0"
            hostgroup: tag-nagios-checkpgsql
          # https://rhaas.blogspot.com/2020/02/useless-vacuuming.html
          - name: PSQL Old Xacts
            command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000"
            hostgroup: tag-nagios-checkpgsql
          - name: Unit postgresql.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service
            hostgroup: tag-nagios-checkpgsql
          # nagios-checkswap
          - name: Swap Usage
            command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
            hostgroup: tag-nagios-checkswap
          # zerotier
          - name: Unit zerotier-one.service
            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service
            hostgroup: tag-zt-personal
      tags: [ nagios ]
    - role: ingress
      vars:
        ingress_servers:
          - name: netbox.desu.ltd
            proxy_pass: http://netbox:8080
          - name: nagios.desu.ltd
            proxy_pass: http://nagios:80
          - name: movie.desu.ltd
            proxy_pass: http://movienight:8089
      tags: [ web, docker, ingress ]