Move Prometheus master to its own container and deploy some scraping on each node

Add Prometheus
Add Grafana
2024-07-09 15:43:41 -05:00 · 2024-07-09 14:53:38 -05:00 · 2024-07-09 14:27:54 -05:00 · 2024-07-09 13:28:48 -05:00
9 changed files with 307 additions and 144 deletions
--- a/inventories/production/group_vars/all.yml
+++ b/inventories/production/group_vars/all.yml
@ -166,6 +166,149 @@ secret_nagios_matrix_token: !vault |
          6433376138386531380a383762393137613738643538343438633730313135613730613139393536
          35666133666262383862663637623738643836383633653864626231623034613662646563623936
          3763356331333561383833386162616664376335333139376363
+nagios_contacts:
+  - name: matrix
+    host_notification_commands: notify-host-by-matrix
+    service_notification_commands: notify-service-by-matrix
+    host_notification_period: ansible-not-late-at-night
+    service_notification_period: ansible-not-late-at-night
+    extra:
+      - key: contactgroups
+        value: ansible
+  - name: salt
+    host_notification_commands: notify-host-by-email
+    service_notification_commands: notify-service-by-email
+    extra:
+      - key: email
+        value: alerts@babor.tech
+nagios_commands:
+  # This command is included in the container image
+  - name: check_nrpe
+    command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
+  - name: check_by_ssh
+    command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
+  - name: notify-host-by-matrix
+    command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\n$HOSTNAME$ is $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
+  - name: notify-service-by-matrix
+    command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\nService $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
+nagios_services:
+  # Agentless checks
+  - name: HTTP
+    command: check_http
+    hostgroup: tag-nagios-checkhttp
+  - name: HTTPS
+    command: check_http!--ssl
+    hostgroup: tag-nagios-checkhttp
+  - name: SSH
+    command: check_ssh
+  # check_by_ssh checks
+  - name: CPU Utilization
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90
+  - name: DNS Resolution
+    command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv
+  - name: Executables in tmp
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_executables_in_tmp
+  - name: Last Ansible Play
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 432000 -c 604800
+  - name: Memory Usage
+    command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5%
+    hostgroup: "ansible,!tag-prov-zfs"
+  - name: Ping Self over DNS
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname
+  - name: Reboot Required
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
+  - name: Unit atd.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service
+  - name: Unit backup.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
+    hostgroup: "ansible,!role-hypervisor"
+  - name: Unit backup.timer
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
+    hostgroup: "ansible,!role-hypervisor"
+  - name: Unit cron.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service
+  - name: Unit dbus.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service
+  - name: Unit ssh.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service
+  - name: Unit systemd-resolved.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service
+    hostgroup: "ansible,!role-hypervisor"
+  - name: Users
+    command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
+  # Privileged checks
+  # Required because check_disk may attempt to get the free space of
+  # restricted mountpoints
+  - name: Disk Usage
+    command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I '^/tmp/.mount_' -I '^/dev/loop'
+  # Device type checks
+  # R720
+  - name: CPU0 Temperature
+    command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000
+    hostgroup: device-type-r720
+  - name: CPU1 Temperature
+    command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001
+    hostgroup: device-type-r720
+  # Pi 4 (2G, 4G and 4G storage models)
+  - name: CPU Temperature
+    command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0
+    hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage
+  # Device role checks
+  # hypervisor (which is assumed to be Proxmox)
+  - name: PVE Unit pve-firewall.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service
+    hostgroup: role-hypervisor
+  - name: PVE Unit spiceproxy.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service
+    hostgroup: role-hypervisor
+  - name: PVE Unit pve-ha-crm.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service
+    hostgroup: role-hypervisor
+  - name: PVE Unit pvedaemon.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service
+    hostgroup: role-hypervisor
+  - name: PVE Unit pvefw-logger.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service
+    hostgroup: role-hypervisor
+  - name: PVE Unit pveproxy.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service
+    hostgroup: role-hypervisor
+  - name: PVE Unit pve-cluster.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service
+    hostgroup: role-hypervisor
+  - name: PVE Unit pvestatd.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service
+    hostgroup: role-hypervisor
+  # Tag-specific checks
+  # docker
+  - name: Unit docker.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
+    hostgroup: "ansible,!tag-no-docker"
+  - name: Docker Status
+    command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running
+    hostgroup: tag-nagios-checkdocker
+  # nagios-checkpgsql
+  - name: PSQL
+    command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
+    hostgroup: tag-nagios-checkpgsql
+  - name: PSQL Connections
+    command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0"
+    hostgroup: tag-nagios-checkpgsql
+  # Old prepared transactions; background reading:
+  # https://rhaas.blogspot.com/2020/02/useless-vacuuming.html
+  - name: PSQL Old Xacts
+    command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000"
+    hostgroup: tag-nagios-checkpgsql
+  - name: Unit postgresql.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service
+    hostgroup: tag-nagios-checkpgsql
+  # nagios-checkswap
+  - name: Swap Usage
+    command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
+    hostgroup: tag-nagios-checkswap
+  # zerotier
+  - name: Unit zerotier-one.service
+    command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service
+    hostgroup: tag-zt-personal

 # For Netbox
 secret_netbox_user_pass: !vault |
--- a/playbooks/local_dns.yml
+++ b/playbooks/local_dns.yml
@ -94,6 +94,8 @@
        # Public
        - record: git.desu.ltd
          value: vm-general-1.ashburn.mgmt.desu.ltd
+        - record: grafana.desu.ltd
+          value: vm-general-1.ashburn.mgmt.desu.ltd
        - record: matrix.desu.ltd
          value: vm-general-1.ashburn.mgmt.desu.ltd
        - record: movie.desu.ltd
@ -104,6 +106,8 @@
          value: vm-general-1.ashburn.mgmt.desu.ltd
        - record: netbox.desu.ltd
          value: vm-general-1.ashburn.mgmt.desu.ltd
+        - record: prometheus.desu.ltd
+          value: vm-general-1.ashburn.mgmt.desu.ltd
        # Public media stuff
        - record: prowlarr.media.desu.ltd
          value: vm-general-1.ashburn.mgmt.desu.ltd
--- a/playbooks/prod_web.yml
+++ b/playbooks/prod_web.yml
@ -16,20 +16,25 @@
    - name: include tasks for applications
      include_tasks: tasks/{{ item }}
      with_items:
+        # Applications
        - app/gitlab-runner.yml
        - app/redis.yml
+        # Frontend web services
        - web/9iron.yml
        - web/desultd.yml
        - web/element-web.yml
        - web/gitea.yml
+        - web/grafana.yml
        - web/netbox.yml
        - web/nextcloud.yml
+        - web/synapse.yml
+        # Backend web services
        - web/prowlarr.yml
        - web/radarr.yml
        - web/sonarr.yml
        - web/srv.yml
-        - web/synapse.yml
        - web/transmission.yml
+        # Games
        - game/factorio.yml
        - game/minecraft-createfarming.yml
        - game/minecraft-direwolf20.yml
@ -53,159 +58,26 @@
          - repo: https://git.desu.ltd/salt/gitea-custom
            dest: /data/gitea/data/gitea/custom
      tags: [ web, git ]
+    - role: prometheus
+      tags: [ prometheus, monitoring ]
    - role: nagios
      vars:
+        # Definitions for contacts and checks are defined in inventory vars
+        # See group_vars/all.yml if you need to change those
        nagios_matrix_server: "https://matrix.desu.ltd"
        nagios_matrix_room: "!NWNCKlNmOTcarMcMIh:desu.ltd"
        nagios_matrix_token: "{{ secret_nagios_matrix_token }}"
        nagios_data_dir: /data/nagios
        nagios_admin_pass: "{{ secret_nagios_admin_pass }}"
-        nagios_contacts:
-          - name: matrix
-            host_notification_commands: notify-host-by-matrix
-            service_notification_commands: notify-service-by-matrix
-            host_notification_period: ansible-not-late-at-night
-            service_notification_period: ansible-not-late-at-night
-            extra:
-              - key: contactgroups
-                value: ansible
-          - name: salt
-            host_notification_commands: notify-host-by-email
-            service_notification_commands: notify-service-by-email
-            extra:
-              - key: email
-                value: alerts@babor.tech
-        nagios_commands:
-          # This command is included in the container image
-          - name: check_nrpe
-            command: "$USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$"
-          - name: check_by_ssh
-            command: "$USER1$/check_by_ssh -H $HOSTADDRESS$ -F /opt/nagios/etc/ssh_config -t 30 -q -i /opt/nagios/etc/id_ed25519 -l nagios-checker -C \"$ARG1$\""
-          - name: notify-host-by-matrix
-            command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\n$HOSTNAME$ is $HOSTSTATE$\\nAddress: $HOSTADDRESS$\\nInfo: $HOSTOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
-          - name: notify-service-by-matrix
-            command: "/usr/bin/printf \"%b\" \"$NOTIFICATIONTYPE$\\nService $HOSTALIAS$ - $SERVICEDESC$ is $SERVICESTATE$\\nInfo: $SERVICEOUTPUT$\\nDate/Time: $LONGDATETIME$\" | /opt/Custom-Nagios-Plugins/notify-by-matrix"
-        nagios_services:
-          # Agentless checks
-          - name: HTTP
-            command: check_http
-            hostgroup: tag-nagios-checkhttp
-          - name: HTTPS
-            command: check_http!--ssl
-            hostgroup: tag-nagios-checkhttp
-          - name: SSH
-            command: check_ssh
-          # check_by_ssh checks
-          - name: CPU Utilization
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_cpu_stats -w 75 -c 90
-          - name: DNS Resolution
-            command: check_by_ssh!/usr/lib/nagios/plugins/check_etc_resolv
-          - name: Executables in tmp
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_executables_in_tmp
-          - name: Last Ansible Play
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_file_age /var/lib/ansible-last-run -w 432000 -c 604800
-          - name: Memory Usage
-            command: check_by_ssh!/usr/lib/nagios/plugins/check_memory -w 10% -c 5%
-            hostgroup: "ansible,!tag-prov-zfs"
-          - name: Ping Self over DNS
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_ping_by_hostname
-          - name: Reboot Required
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_reboot_required
-          - name: Unit atd.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit atd.service
-          - name: Unit backup.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.service
-            hostgroup: "ansible,!role-hypervisor"
-          - name: Unit backup.timer
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit backup.timer
-            hostgroup: "ansible,!role-hypervisor"
-          - name: Unit cron.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit cron.service
-          - name: Unit dbus.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit dbus.service
-          - name: Unit ssh.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit ssh.service
-          - name: Unit systemd-resolved.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit systemd-resolved.service
-            hostgroup: "ansible,!role-hypervisor"
-          - name: Users
-            command: check_by_ssh!/usr/lib/nagios/plugins/check_users -w 3 -c 5
-          # Privileged checks
-          # Required because check_disk may attempt to get the free space of
-          # restricted mountpoints
-          - name: Disk Usage
-            command: check_by_ssh!/usr/bin/sudo /usr/lib/nagios/plugins/check_disk -M -u GB -X nfs -X tracefs -X cgroup -X tmpfs -X overlay -X shm -w 15% -c 10% -W 15% -K 10% -A -I '^/run/' -I '^udev$' -I '^/var/lib/kubelet/' -I '^/tmp/.mount_' -I '^/dev/loop'
-          # Device type checks
-          # R720
-          - name: CPU0 Temperature
-            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0000
-            hostgroup: device-type-r720
-          - name: CPU1 Temperature
-            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor coretemp-isa-0001
-            hostgroup: device-type-r720
-          # Pi 4 4G
-          - name: CPU Temperature
-            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_temp -n -w 65 -c 75 --sensor cpu_thermal-virtual-0
-            hostgroup: device-type-pi4b-2g,device-type-pi4b-4g,device-type-pi4b-4g-storage
-          # Device role checks
-          # hypervisor (which is assumed to be Proxmox)
-          - name: PVE Unit pve-firewall.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-firewall.service
-            hostgroup: role-hypervisor
-          - name: PVE Unit spiceproxy.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit spiceproxy.service
-            hostgroup: role-hypervisor
-          - name: PVE Unit pve-ha-crm.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-ha-crm.service
-            hostgroup: role-hypervisor
-          - name: PVE Unit pvedaemon.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvedaemon.service
-            hostgroup: role-hypervisor
-          - name: PVE Unit pvefw-logger.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvefw-logger.service
-            hostgroup: role-hypervisor
-          - name: PVE Unit pveproxy.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pveproxy.service
-            hostgroup: role-hypervisor
-          - name: PVE Unit pve-cluster.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pve-cluster.service
-            hostgroup: role-hypervisor
-          - name: PVE Unit pvestatd.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit pvestatd.service
-            hostgroup: role-hypervisor
-          # Tag-specific checks
-          # docker
-          - name: Unit docker.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit docker.service
-            hostgroup: "ansible,!tag-no-docker"
-          - name: Docker Status
-            command: check_by_ssh!/usr/bin/sudo /usr/local/bin/monitoring-scripts/check_docker --no-ok --status running
-            hostgroup: tag-nagios-checkdocker
-          # nagios-checkpgsql
-          - name: PSQL
-            command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5"
-            hostgroup: tag-nagios-checkpgsql
-          - name: PSQL Connections
-            command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select (select count(*)::float used from pg_stat_activity) / (select setting::int max_conn from pg_settings where name=\\$\\$max_connections\\$\\$)' -W 0.7-0.8 -C 0.8-1.0"
-            hostgroup: tag-nagios-checkpgsql
-            # https://rhaas.blogspot.com/2020/02/useless-vacuuming.html
-          - name: PSQL Old Xacts
-            command: "check_by_ssh!/usr/lib/nagios/plugins/check_pgsql -H localhost -l nagios -p {{ secret_postgresql_monitoring_password }} -w 2 -c 5 -q 'select count(*)::float from pg_prepared_xacts where age(transaction) > 5000000' -W 500-1000 -C 1000-1000000"
-            hostgroup: tag-nagios-checkpgsql
-          - name: Unit postgresql.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit postgresql.service
-            hostgroup: tag-nagios-checkpgsql
-          # nagios-checkswap
-          - name: Swap Usage
-            command: check_by_ssh!/usr/lib/nagios/plugins/check_swap -w 20% -c 10%
-            hostgroup: tag-nagios-checkswap
-          # zerotier
-          - name: Unit zerotier-one.service
-            command: check_by_ssh!/usr/local/bin/monitoring-scripts/check_systemd_unit zerotier-one.service
-            hostgroup: tag-zt-personal
      tags: [ nagios, no-auto ]
    - role: ingress
      vars:
+        ingress_head: |
+          # Used by Grafana: maps the client's Upgrade header to the Connection
+          # header value for the proxied /api/live/ (WebSocket) location
+          map $http_upgrade $connection_upgrade {
+            default upgrade;
+            '' close;
+          }
        ingress_servers:
          # desu.ltd
          - name: desu.ltd
@ -221,6 +93,16 @@
                  return 200 '{"m.homeserver":{"base_url":"https://matrix.desu.ltd"}}';
          - name: git.desu.ltd
            proxy_pass: http://gitea:3000
+          - name: grafana.desu.ltd
+            proxy_pass: http://grafana:3000
+            locations:
+              - location: "/api/live/"
+                contents: |
+                      proxy_http_version 1.1;
+                      proxy_set_header Upgrade $http_upgrade;
+                      proxy_set_header Connection $connection_upgrade;
+                      proxy_set_header Host $host;
+                      proxy_pass http://grafana:3000;
          - name: matrix.desu.ltd
            proxies:
              - location: "~* ^(\/_matrix|\/_synapse|\/client|\/health)"
@ -245,6 +127,16 @@
                  try_files $uri $uri/ =404;
          - name: netbox.desu.ltd
            proxy_pass: http://netbox:8080
+          - name: prometheus.desu.ltd
+            directives:
+              - "allow {{ common_home_address }}/{{ common_home_address_mask }}"
+              - "allow 10.0.0.0/8"
+              - "allow 172.16.0.0/12"
+              - "allow 192.168.0.0/16"
+              # TODO: Replace this with a dynamically-generated list of public IPs from inv
+              - "allow 45.79.58.44/32"  # bastion1.dallas.mgmt.desu.ltd
+              - "deny all"
+            proxy_pass: http://prometheus:9090
          # desu.ltd media bullshit
          - name: prowlarr.media.desu.ltd
            directives:
--- a/playbooks/tags_nagios.yml
+++ b/playbooks/tags_nagios.yml
@ -35,6 +35,32 @@
        - /usr/local/bin/monitoring-scripts/check_docker
        - /usr/local/bin/monitoring-scripts/check_temp
      tags: [ nagios, sudo ]
+    # Host-level metrics exporter. It runs with host networking, so it is
+    # reached directly on the node; the Prometheus scrape config in this repo
+    # targets port 9100 on every tags_nagios host.
+    # https://github.com/prometheus/node_exporter
+    - name: assure prometheus node exporter
+      community.docker.docker_container:
+        name: prometheus-node-exporter
+        image: quay.io/prometheus/node-exporter:latest
+        command:
+          - '--path.rootfs=/host'
+        network_mode: host
+        pid_mode: host
+        volumes:
+          # Host root, read-only, exposed at the path given to --path.rootfs
+          - /:/host:ro,rslave
+      tags: [ prometheus ]
+    # Per-container metrics exporter (cAdvisor)
+    - name: assure prometheus cadvisor exporter
+      community.docker.docker_container:
+        name: prometheus-cadvisor-exporter
+        image: gcr.io/cadvisor/cadvisor:latest
+        ports:
+          # Container port 8080 is published on host port 9101, which is the
+          # port the Prometheus scrape config in this repo targets
+          - "9101:8080/tcp"
+        volumes:
+          - /:/rootfs:ro
+          - /var/run:/var/run:ro
+          - /sys:/sys:ro
+          - /var/lib/docker:/var/lib/docker:ro
+          - /dev/disk:/dev/disk:ro
+        devices:
+          - /dev/kmsg
+      # Tagged like the node exporter task so `--tags prometheus` deploys both
+      tags: [ prometheus ]
 - hosts: all
  gather_facts: no
  tasks:
--- a/playbooks/tasks/web/grafana.yml
+++ b/playbooks/tasks/web/grafana.yml
@ -0,0 +1,30 @@
+# vim:ft=ansible:
+# Host directories that the grafana container bind-mounts for its data and logs
+- name: ensure grafana dirs
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: directory
+    # NOTE(review): 472 is presumably the UID/GID the Grafana image runs as - confirm
+    owner: 472
+    group: 472
+    mode: "0750"
+  loop:
+    - /data/grafana/storage
+    - /data/grafana/logs
+  tags: [ docker, grafana, monitoring ]
+# Runs Grafana on the shared "web" Docker network under the alias "grafana"
+- name: docker deploy grafana
+  community.docker.docker_container:
+    name: grafana
+    # NOTE(review): the "main" tag presumably tracks development builds rather
+    # than releases - confirm this is intended before relying on it
+    image: grafana/grafana-oss:main
+    env:
+      TZ: "America/Chicago"
+      # This enables logging to STDOUT for log aggregators to more easily hook it
+      GF_LOG_MODE: "console file"
+      GF_SERVER_DOMAIN: "grafana.desu.ltd"
+      # Grafana itself serves plain HTTP while the root URL is HTTPS
+      GF_SERVER_PROTOCOL: "http"
+      GF_SERVER_ROOT_URL: "https://grafana.desu.ltd"
+    networks:
+      - name: web
+        aliases: [ "grafana" ]
+    volumes:
+      - /data/grafana/storage:/var/lib/grafana
+      - /data/grafana/logs:/var/log/grafana
+  tags: [ docker, grafana, monitoring ]
--- a/roles/ingress/templates/vhosts.conf.j2
+++ b/roles/ingress/templates/vhosts.conf.j2
@ -1,3 +1,7 @@
+{% if ingress_head is defined %}
+{{ ingress_head }}
+{% endif %}
+
 {% for server in ingress_servers %}
 server {
 {% if loop.index == 1 %}
--- a/roles/prometheus/handlers/main.yml
+++ b/roles/prometheus/handlers/main.yml
@ -0,0 +1,5 @@
+#!/usr/bin/env ansible-playbook
+# vim:ft=ansible:
+# Notified by the role's tasks whenever the config directories or the rendered
+# prometheus.yml change. The name must stay in sync with their `notify:` values.
+- name: restart prometheus container
+  community.docker.docker_container:
+    name: prometheus
+    state: started
+    restart: true
+  become: true
--- a/roles/prometheus/tasks/main.yml
+++ b/roles/prometheus/tasks/main.yml
@ -0,0 +1,33 @@
+# vim:ft=ansible:
+# Host directories bind-mounted into the prometheus container; owned by the
+# same UID/GID the container is started with
+- name: ensure prometheus dirs
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: directory
+    owner: 5476
+    group: 5476
+    mode: "0750"
+  loop:
+    - /data/prometheus/config
+    - /data/prometheus/data
+  notify: restart prometheus container
+# Renders the scrape configuration into the bind-mounted config directory and
+# restarts the container when the rendered file changes
+- name: template out configuration file
+  ansible.builtin.template:
+    src: prometheus.yml.j2
+    dest: /data/prometheus/config/prometheus.yml
+    owner: 5476
+    group: 5476
+    mode: "0640"
+  notify: restart prometheus container
+# Runs Prometheus on the shared "web" Docker network under the alias "prometheus"
+- name: docker deploy prometheus
+  community.docker.docker_container:
+    name: prometheus
+    image: prom/prometheus:latest
+    # Same UID:GID that owns the bind-mounted directories; quoted so the
+    # digits-and-colon value is always read as a string
+    user: "5476:5476"
+    env:
+      TZ: "America/Chicago"
+    networks:
+      - name: web
+        aliases: [ "prometheus" ]
+    volumes:
+      - /data/prometheus/config:/etc/prometheus
+      - /data/prometheus/data:/prometheus
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@ -0,0 +1,26 @@
+# my global config
+---
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries
+  # scraped from this config.
+  - job_name: "prometheus"
+    # metrics_path defaults to '/metrics'
+    # scheme defaults to 'http'.
+    static_configs:
+      - targets: ["localhost:9090"]
+  - job_name: "node-exporter"
+    static_configs:
+      - targets:
+{% for host in groups['tags_nagios'] %}
+          - '{{ host }}:9100'
+{% endfor %}
+  - job_name: "cadvisor-exporter"
+    static_configs:
+      - targets:
+{% for host in groups['tags_nagios'] %}
+          - '{{ host }}:9101'
+{% endfor %}
Author	SHA1	Message	Date
Jacob Babor	ca9882adde	Move Prometheus master to its own container and deploy some scraping on each node	2024-07-09 15:43:41 -05:00
Jacob Babor	e63898f328	Add Prometheus	2024-07-09 14:53:38 -05:00
Jacob Babor	12f187e1e2	Add Grafana	2024-07-09 14:27:54 -05:00
Jacob Babor	9cad3d4867	Move nagios defs to inventory	2024-07-09 13:28:48 -05:00