Add more details to check_reboot_required, adding packages to the output string

Add check_docker
2021-09-14 17:20:59 -05:00 · 2021-09-07 15:16:31 -05:00
2 changed files with 1006 additions and 2 deletions
@@ -0,0 +1,996 @@
+#!/usr/bin/env python3
+# logging.basicConfig(level=logging.DEBUG)
+import argparse
+import json
+import logging
+import math
+import os
+import re
+import socket
+import stat
+import traceback
+from collections import deque, namedtuple, UserDict, defaultdict
+from concurrent import futures
+from datetime import datetime, timezone
+from functools import lru_cache
+from http.client import HTTPConnection
+from sys import argv
+from urllib import request
+from urllib.error import HTTPError, URLError
+from urllib.request import AbstractHTTPHandler, HTTPHandler, HTTPSHandler, OpenerDirector, HTTPRedirectHandler, \
+    Request, HTTPBasicAuthHandler
+
+logger = logging.getLogger()
+__author__ = 'Tim Laurence'
+__copyright__ = "Copyright 2019"
+__credits__ = ['Tim Laurence']
+__license__ = "GPL"
+__version__ = "2.2.2"
+
+'''
+nrpe compatible check for docker containers.
+
+Requires Python 3
+
+Note: I really would have preferred to have used requests for all the network connections but that would have added a
+dependency.
+'''
+
+DEFAULT_SOCKET = '/var/run/docker.sock'
+DEFAULT_TIMEOUT = 10.0
+DEFAULT_PORT = 2375
+DEFAULT_MEMORY_UNITS = 'B'
+DEFAULT_HEADERS = [('Accept', 'application/vnd.docker.distribution.manifest.v2+json')]
+DEFAULT_PUBLIC_REGISTRY = 'registry-1.docker.io'
+
+# The second value is the power to raise the base to.
+UNIT_ADJUSTMENTS_TEMPLATE = {
+    '%': 0,
+    'B': 0,
+    'KB': 1,
+    'MB': 2,
+    'GB': 3,
+    'TB': 4
+}
+unit_adjustments = None
+
+# Reduce message to a single OK unless a checks fail.
+no_ok = False
+
+# Suppress performance data reporting
+no_performance = False
+
+OK_RC = 0
+WARNING_RC = 1
+CRITICAL_RC = 2
+UNKNOWN_RC = 3
+
+# These hold the final results
+rc = -1
+messages = []
+performance_data = []
+
+ImageName = namedtuple('ImageName', "registry name tag full_name")
+
+
+class ThresholdSpec(UserDict):
+    def __init__(self, warn, crit, units=''):
+        super().__init__(warn=warn, crit=crit, units=units)
+
+    def __getattr__(self, item):
+        if item in ('warn', 'crit', 'units'):
+            return self.data[item]
+        return super().__getattr__(item)
+
+
+# How much threading can we do? We are generally not CPU bound so I am using this a worse case cap
+DEFAULT_PARALLELISM = 10
+
+# Holds list of all threads
+threads = []
+
+# This is used during testing
+DISABLE_THREADING = False
+
+
+# Hacked up urllib to handle sockets
+#############################################################################################
+# Docker runs a http connection over a socket. http.client is knows how to deal with these
+# but lacks some niceties. Urllib wraps that and makes up for some of the deficiencies but
+# cannot fix the fact http.client can't read from socket files. In order to take advantage of
+# urllib and http.client's  capabilities the class below tweaks HttpConnection and passes it
+# to urllib registering for socket:// connections
+
+# This is all side effect so excluding coverage
+class SocketFileHandler(AbstractHTTPHandler):
+    class SocketFileToHttpConnectionAdaptor(HTTPConnection): # pragma: no cover
+        def __init__(self, socket_file, timeout=DEFAULT_TIMEOUT):
+            super().__init__(host='', port=0, timeout=timeout)
+            self.socket_file = socket_file
+
+        def connect(self):
+            self.sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM, proto=0, fileno=None)
+            self.sock.settimeout(self.timeout)
+            self.sock.connect(self.socket_file)
+
+    def socket_open(self, req):
+        socket_file, path = req.selector.split(':', 1)
+        req.host = socket_file
+        req.selector = path
+        return self.do_open(self.SocketFileToHttpConnectionAdaptor, req)
+
+
+# Tokens are not cached because I expect the callers to cache the responses
+class Oauth2TokenAuthHandler(HTTPBasicAuthHandler):
+    auth_failure_tracker = defaultdict(int)
+
+    def http_response(self, request, response):
+        code, hdrs = response.code, response.headers
+
+        www_authenticate_header = response.headers.get('www-authenticate', None)
+        if code == 401 and www_authenticate_header:
+            scheme = www_authenticate_header.split()[0]
+            if scheme.lower() == 'bearer':
+                return self.process_oauth2(request, response, www_authenticate_header)
+
+        return response
+
+    https_response = http_response
+
+    @staticmethod
+    def _get_outh2_token(www_authenticate_header):
+        auth_fields = dict(re.findall(r"""(?:(?P<key>[^ ,=]+)="([^"]+)")""", www_authenticate_header))
+
+        auth_url = "{realm}?scope={scope}&service={service}".format(
+            realm=auth_fields['realm'],
+            scope=auth_fields['scope'],
+            service=auth_fields['service'],
+        )
+        token_request = Request(auth_url)
+        token_request.add_header("Content-Type", "application/x-www-form-urlencoded; charset=utf-8")
+        token_response = request.urlopen(token_request)
+        return process_urllib_response(token_response)['token']
+
+    def process_oauth2(self, request, response, www_authenticate_header):
+
+        # This keeps infinite auth loops from happening
+        full_url = request.full_url
+        self.auth_failure_tracker[full_url] += 1
+        if self.auth_failure_tracker[full_url] > 1:
+            raise HTTPError(full_url, 401, "Stopping Oauth2 failure loop for {}".format(full_url),
+                            response.headers, response)
+
+        auth_token = self._get_outh2_token(www_authenticate_header)
+
+        request.add_unredirected_header('Authorization', 'Bearer ' + auth_token)
+        return self.parent.open(request, timeout=request.timeout)
+
+
+better_urllib_get = OpenerDirector()
+better_urllib_get.addheaders = DEFAULT_HEADERS.copy()
+better_urllib_get.add_handler(HTTPHandler())
+better_urllib_get.add_handler(HTTPSHandler())
+better_urllib_get.add_handler(HTTPRedirectHandler())
+better_urllib_get.add_handler(SocketFileHandler())
+better_urllib_get.add_handler(Oauth2TokenAuthHandler())
+
+
+class RegistryError(Exception):
+    def __init__(self, response):
+        self.response_obj = response
+
+
+# Util functions
+#############################################################################################
+def parse_thresholds(spec, include_units=True, units_required=True):
+    """
+    Given a spec string break it up into ':' separated chunks. Convert strings to ints as it makes sense
+
+    :param spec: The threshold specification being parsed
+    :param include_units: Specifies that units should be processed and returned if present
+    :param units_required: Mark spec as invalid if the units are missing.
+    :return: A list containing the thresholds in order of warn, crit, and units(if included and present)
+    """
+    parts = deque(spec.split(':'))
+    if not all(parts):
+        raise ValueError("Blanks are not allowed in a threshold specification: {}".format(spec))
+
+    # Warn
+    warn = int(parts.popleft())
+    # Crit
+    crit = int(parts.popleft())
+
+    units = ''
+    if include_units:
+        if len(parts):
+            # units
+            units = parts.popleft()
+        elif units_required:
+            raise ValueError("Missing units in {}".format(spec))
+
+    if len(parts) != 0:
+        raise ValueError("Too many threshold specifiers in {}".format(spec))
+
+    return ThresholdSpec(warn=warn, crit=crit, units=units)
+
+
+def pretty_time(seconds):
+    remainder = seconds
+    result = []
+    if remainder > 24 * 60 * 60:
+        days, remainder = divmod(remainder, 24 * 60 * 60)
+        result.append("{}d".format(int(days)))
+    if remainder > 60 * 60:
+        hours, remainder = divmod(remainder, 60 * 60)
+        result.append("{}h".format(int(hours)))
+    if remainder > 60:
+        minutes, remainder = divmod(remainder, 60)
+        result.append("{}min".format(int(minutes)))
+    result.append("{}s".format(int(remainder)))
+    return result
+
+
+def evaluate_numeric_thresholds(container, value, thresholds, name, short_name,
+                                min=None, max=None, greater_than=True):
+    rounder = lambda x: round(x, 2)
+
+    INTEGER_UNITS = ['B', '%', '']
+
+    # Some units don't have decimal places
+    rounded_value = int(value) if thresholds.units in INTEGER_UNITS else rounder(value)
+
+    perf_string = "{container}_{short_name}={value}{units};{warn};{crit}".format(
+        container=container,
+        short_name=short_name,
+        value=rounded_value,
+        **thresholds)
+    if min is not None:
+        rounded_min = math.floor(min) if thresholds.units in INTEGER_UNITS else rounder(min)
+        perf_string += ';{}'.format(rounded_min)
+        if max is not None:
+            rounded_max = math.ceil(max) if thresholds.units in INTEGER_UNITS else rounder(max)
+            perf_string += ';{}'.format(rounded_max)
+
+    global performance_data
+    performance_data.append(perf_string)
+
+    if thresholds.units == 's':
+        nice_time = ' '.join(pretty_time(rounded_value)[:2])
+        results_str = "{} {} is {}".format(container, name, nice_time)
+    else:
+        results_str = "{} {} is {}{}".format(container, name, rounded_value, thresholds.units)
+
+    if greater_than:
+        comparator = lambda value, threshold: value >= threshold
+    else:
+        comparator = lambda value, threshold: value <= threshold
+
+    if comparator(value, thresholds.crit):
+        critical(results_str)
+    elif comparator(value, thresholds.warn):
+        warning(results_str)
+    else:
+        ok(results_str)
+
+
+@lru_cache(maxsize=None)
+def get_url(url):
+    logger.debug("get_url: {}".format(url))
+    response = better_urllib_get.open(url, timeout=timeout)
+    logger.debug("get_url: {} {}".format(url, response.status))
+    return process_urllib_response(response), response.status
+
+
+def process_urllib_response(response):
+    response_bytes = response.read()
+    body = response_bytes.decode('utf-8')
+    # logger.debug("BODY: {}".format(body))
+    return json.loads(body)
+
+
+def get_container_info(name):
+    content, _ = get_url(daemon + '/containers/{container}/json'.format(container=name))
+    return content
+
+
+def get_image_info(name):
+    content, _ = get_url(daemon + '/images/{image}/json'.format(image=name))
+    return content
+
+
+def get_state(container):
+    return get_container_info(container)['State']
+
+
+def get_stats(container):
+    content, _ = get_url(daemon + '/containers/{container}/stats?stream=0'.format(container=container))
+    return content
+
+
+def get_ps_name(name_list):
+    # Pick the name that starts with a '/' but doesn't contain a '/' and return that value
+    for name in name_list:
+        if '/' not in name[1:] and name[0] == '/':
+            return name[1:]
+    else:
+        raise NameError("Error when trying to identify 'ps' name in {}".format(name_list))
+
+
+def get_containers(names, require_present):
+    containers_list, _ = get_url(daemon + '/containers/json?all=1')
+
+    all_container_names = set(get_ps_name(x['Names']) for x in containers_list)
+
+    if 'all' in names:
+        return all_container_names
+
+    filtered = set()
+    for matcher in names:
+        found = False
+        for candidate in all_container_names:
+            if re.match("^{}$".format(matcher), candidate):
+                filtered.add(candidate)
+                found = True
+        # If we don't find a container that matches out regex
+        if require_present and not found:
+            critical("No containers match {}".format(matcher))
+
+    return filtered
+
+
+def get_container_image_id(container):
+    # find registry and tag
+    inspection = get_container_info(container)
+    return inspection['Image']
+
+
+def get_container_image_urls(container):
+    inspection = get_container_info(container)
+    image_id = inspection['Image']
+    image_info = get_image_info(image_id)
+    return image_info['RepoTags']
+
+
+def normalize_image_name_to_manifest_url(image_name, insecure_registries):
+    parsed_url = parse_image_name(image_name)
+
+    lower_insecure = [reg.lower() for reg in insecure_registries]
+
+    # Registry query url
+    scheme = 'http' if parsed_url.registry.lower() in lower_insecure else 'https'
+    url = '{scheme}://{registry}/v2/{image_name}/manifests/{image_tag}'.format(scheme=scheme,
+                                                                               registry=parsed_url.registry,
+                                                                               image_name=parsed_url.name,
+                                                                               image_tag=parsed_url.tag)
+    return url, parsed_url.registry
+
+
+# Auth servers seem picky about being hit too hard. Can't figure out why. ;)
+# As result it is best to single thread this check
+# This is based on https://docs.docker.com/registry/spec/auth/token/#requesting-a-token
+def get_digest_from_registry(url):
+    logger.debug("get_digest_from_registry")
+    # query registry
+    # TODO: Handle logging in if needed
+    registry_info, status_code = get_url(url=url)
+
+    if status_code != 200:
+        raise RegistryError(response=registry_info)
+    return registry_info['config'].get('digest', None)
+
+
+def set_rc(new_rc):
+    global rc
+    rc = new_rc if new_rc > rc else rc
+
+
+def ok(message):
+    set_rc(OK_RC)
+    messages.append('OK: ' + message)
+
+
+def warning(message):
+    set_rc(WARNING_RC)
+    messages.append('WARNING: ' + message)
+
+
+def critical(message):
+    set_rc(CRITICAL_RC)
+    messages.append('CRITICAL: ' + message)
+
+
+def unknown(message):
+    set_rc(UNKNOWN_RC)
+    messages.append('UNKNOWN: ' + message)
+
+
+def require_running(name):
+    def inner_decorator(func):
+        def wrapper(container, *args, **kwargs):
+            container_state = get_state(container)
+            state = normalize_state(container_state)
+            if state.lower() == "running":
+                func(container, *args, **kwargs)
+            else:
+                # container is not running, can't perform check
+                critical('{container} is not "running", cannot check {check}"'.format(container=container,
+                                                                                      check=name))
+
+        return wrapper
+
+    return inner_decorator
+
+
+def multithread_execution(disable_threading=DISABLE_THREADING):
+    def inner_decorator(func):
+        def wrapper(container, *args, **kwargs):
+            if DISABLE_THREADING:
+                func(container, *args, **kwargs)
+            else:
+                threads.append(parallel_executor.submit(func, container, *args, **kwargs))
+
+        return wrapper
+
+    return inner_decorator
+
+
+def singlethread_execution(disable_threading=DISABLE_THREADING):
+    def inner_decorator(func):
+        def wrapper(container, *args, **kwargs):
+            if DISABLE_THREADING:
+                func(container, *args, **kwargs)
+            else:
+                threads.append(serial_executor.submit(func, container, *args, **kwargs))
+
+        return wrapper
+
+    return inner_decorator
+
+
+def parse_image_name(image_name):
+    """
+    Parses image names into their constituent parts.
+    :param image_name:
+    :return: ImageName
+    """
+
+    # These are based on information found here
+    #   https://docs.docker.com/engine/reference/commandline/tag/#extended-description
+    #   https://github.com/docker/distribution/blob/master/reference/regexp.go
+    host_segment_re = '[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?'
+    hostname_re = r'({host_segment}\.)+{host_segment}'.format(host_segment=host_segment_re)
+    registry_re = r'((?P<registry>({hostname_re}(:\d+)?|{host_segment_re}:\d+))/)'.format(
+        host_segment_re=host_segment_re, hostname_re=hostname_re)
+    name_component_ends_re = '[a-z0-9]'
+    name_component_middle_re = '[a-z0-9._-]'  # Ignoring spec limit of two _
+    name_component_re = '({end}{middle}*{end}|{end})'.format(end=name_component_ends_re,
+                                                             middle=name_component_middle_re)
+    image_name_re = "(?P<image_name>({name_component}/)*{name_component})".format(name_component=name_component_re)
+    image_tag_re = '(?P<image_tag>[a-zA-Z0-9_][a-zA-Z0-9_.-]*)'
+    full_re = '^{registry}?{image_name}(:{image_tag})?$'.format(registry=registry_re, image_name=image_name_re,
+                                                                image_tag=image_tag_re)
+    parsed = re.match(full_re, image_name)
+
+    registry = parsed.group('registry') if parsed.group('registry') else DEFAULT_PUBLIC_REGISTRY
+
+    image_name = parsed.group('image_name')
+    image_name = image_name if '/' in image_name or registry != DEFAULT_PUBLIC_REGISTRY else 'library/' + image_name
+
+    image_tag = parsed.group('image_tag')
+    image_tag = image_tag if image_tag else 'latest'
+
+    full_image_name = "{registry}/{image_name}:{image_tag}".format(
+        registry=registry,
+        image_name=image_name,
+        image_tag=image_tag)
+
+    return ImageName(registry=registry, name=image_name, tag=image_tag, full_name=full_image_name)
+
+
+def normalize_state(status_info):
+    # Ugh, docker used to report state in as silly way then they figured out how to do it better.
+    # This tries the simpler new way and if that doesn't work fails back to the old way
+
+    # On new docker engines the status holds whatever the current state is, running, stopped, paused, etc.
+    if "Status" in status_info:
+        return status_info['Status']
+
+    status = 'Exited'
+    if status_info["Restarting"]:
+        status = 'Restarting'
+    elif status_info["Paused"]:
+        status = 'Paused'
+    elif status_info["Dead"]:
+        status = 'Dead'
+    elif status_info["Running"]:
+        return "Running"
+    return status
+
+
+# Checks
+#############################################################################################
+
+@multithread_execution()
+@require_running(name='memory')
+def check_memory(container, thresholds):
+    if not thresholds.units in unit_adjustments:
+        unknown("Memory units must be one of  {}".format(list(unit_adjustments.keys())))
+        return
+
+    inspection = get_stats(container)
+
+    # Subtracting cache to match what `docker stats` does.
+    adjusted_usage = inspection['memory_stats']['usage'] - inspection['memory_stats']['stats']['total_cache']
+    if thresholds.units == '%':
+        max = 100
+        usage = int(100 * adjusted_usage / inspection['memory_stats']['limit'])
+    else:
+        max = inspection['memory_stats']['limit'] / unit_adjustments[thresholds.units]
+        usage = adjusted_usage / unit_adjustments[thresholds.units]
+
+    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='memory',
+                                short_name='mem', min=0, max=max)
+
+
+@multithread_execution()
+def check_status(container, desired_state):
+    normized_desired_state = desired_state.lower()
+    normalized_state = normalize_state(get_state(container)).lower()
+    if normized_desired_state != normalized_state:
+        critical("{} state is not {}".format(container, desired_state))
+        return
+    ok("{} status is {}".format(container, desired_state))
+
+
+@multithread_execution()
+@require_running('health')
+def check_health(container):
+    state = get_state(container)
+    if "Health" in state and "Status" in state["Health"]:
+        health = state["Health"]["Status"]
+        message = "{} is {}".format(container, health)
+        if health == 'healthy':
+            ok(message)
+        elif health == 'unhealthy':
+            critical(message)
+        else:
+            unknown(message)
+    else:
+        unknown('{} has no health check data'.format(container))
+
+
+@multithread_execution()
+@require_running('uptime')
+def check_uptime(container, thresholds):
+    inspection = get_container_info(container)['State']['StartedAt']
+    only_secs = inspection[0:19]
+    start = datetime.strptime(only_secs, "%Y-%m-%dT%H:%M:%S")
+    start = start.replace(tzinfo=timezone.utc)
+    now = datetime.now(timezone.utc)
+    uptime = (now - start).total_seconds()
+
+    graph_padding = 2
+    thresholds.units = 's'
+    evaluate_numeric_thresholds(container=container, value=uptime, thresholds=thresholds, name='uptime',
+                                short_name='up', min=0, max=graph_padding, greater_than=False)
+
+
+@multithread_execution()
+def check_image_age(container, thresholds):
+    container_image = get_container_info(container)['Image']
+    image_created = get_image_info(container_image)['Created']
+    only_secs = image_created[0:19]
+    start = datetime.strptime(only_secs, "%Y-%m-%dT%H:%M:%S")
+    start = start.replace(tzinfo=timezone.utc)
+    now = datetime.now(timezone.utc)
+    image_age = (now - start).days
+
+    graph_padding = 2
+    thresholds.units = 'd'
+    evaluate_numeric_thresholds(container=container, value=image_age, thresholds=thresholds, name='image_age',
+                                short_name='age', min=0, max=graph_padding, greater_than=True)
+
+
+@multithread_execution()
+@require_running('restarts')
+def check_restarts(container, thresholds):
+    inspection = get_container_info(container)
+
+    restarts = int(inspection['RestartCount'])
+    graph_padding = 2
+    evaluate_numeric_thresholds(container=container, value=restarts, thresholds=thresholds, name='restarts',
+                                short_name='re', min=0, max=graph_padding)
+
+
+@singlethread_execution()
+def check_version(container, insecure_registries):
+    image_id = get_container_image_id(container)
+    logger.debug("Local container image ID: {}".format(image_id))
+    if image_id is None:
+        unknown('Checksum missing for "{}", try doing a pull'.format(container))
+        return
+
+    image_urls = get_container_image_urls(container=container)
+    if len(image_urls) > 1:
+        unknown('"{}" has multiple tags/names. Unsure which one to use to check the version.'.format(container))
+        return
+    elif len(image_urls) == 0:
+        unknown('"{}" has last no repository tag. Is this anywhere else?'.format(container))
+        return
+
+    url, registry = normalize_image_name_to_manifest_url(image_urls[0], insecure_registries)
+    logger.debug("Looking up image digest here {}".format(url))
+    try:
+        registry_hash = get_digest_from_registry(url)
+    except URLError as e:
+        if hasattr(e.reason, 'reason') and e.reason.reason == 'UNKNOWN_PROTOCOL':
+            unknown(
+                "TLS error connecting to registry {} for {}, should you use the '--insecure-registry' flag?" \
+                    .format(registry, container))
+            return
+        elif hasattr(e.reason, 'strerror') and e.reason.strerror == 'nodename nor servname provided, or not known':
+            unknown(
+                "Cannot reach registry for {} at {}".format(container, url))
+            return
+        else:
+            raise e
+    except RegistryError as e:
+        unknown("Cannot check version, couldn't retrieve digest for {} while checking {}.".format(container, url))
+        return
+    logger.debug("Image digests, local={} remote={}".format(image_id, registry_hash))
+    if registry_hash == image_id:
+        ok("{}'s version matches registry".format(container))
+        return
+    critical("{}'s version does not match registry".format(container))
+
+
+def calculate_cpu_capacity_precentage(info, stats):
+    host_config = info['HostConfig']
+
+    if 'online_cpus' in stats['cpu_stats']:
+        num_cpus = stats['cpu_stats']['online_cpus']
+    else:
+        num_cpus = len(stats['cpu_stats']['cpu_usage']['percpu_usage'])
+
+    # Identify limit system being used
+    # --cpus
+    if 'NanoCpus' in host_config and host_config['NanoCpus'] != 0:
+        period = 1000000000
+        quota = host_config['NanoCpus']
+    # --cpu-quota
+    elif 'CpuQuota' in host_config and host_config['CpuQuota'] != 0:
+        period = 100000 if host_config['CpuPeriod'] == 0 else host_config['CpuPeriod']
+        quota = host_config['CpuQuota']
+    # unlimited
+    else:
+        period = 1
+        quota = num_cpus
+
+    if period * num_cpus < quota:
+        # This handles the case where the quota is actually bigger than amount available by all the cpus.
+        available_limit_ratio = 1
+    else:
+        available_limit_ratio = (period * num_cpus) / quota
+
+    cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
+    system_delta = stats['cpu_stats']['system_cpu_usage'] - stats['precpu_stats']['system_cpu_usage']
+    usage = (cpu_delta / system_delta) * available_limit_ratio
+    usage = round(usage * 100, 0)
+    return usage
+
+
+@multithread_execution()
+@require_running('cpu')
+def check_cpu(container, thresholds):
+    info = get_container_info(container)
+
+    stats = get_stats(container=container)
+
+    usage = calculate_cpu_capacity_precentage(info=info, stats=stats)
+
+    max = 100
+    thresholds.units = '%'
+    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='cpu', short_name='cpu',
+                                min=0, max=max)
+
+
+def process_args(args):
+    parser = argparse.ArgumentParser(description='Check docker containers.')
+
+    # Connect to local socket or ip address
+    connection_group = parser.add_mutually_exclusive_group()
+    connection_group.add_argument('--connection',
+                                  dest='connection',
+                                  action='store',
+                                  default=DEFAULT_SOCKET,
+                                  type=str,
+                                  metavar='[/<path to>/docker.socket|<ip/host address>:<port>]',
+                                  help='Where to find docker daemon socket. (default: %(default)s)')
+
+    connection_group.add_argument('--secure-connection',
+                                  dest='secure_connection',
+                                  action='store',
+                                  type=str,
+                                  metavar='[<ip/host address>:<port>]',
+                                  help='Where to find TLS protected docker daemon socket.')
+
+    base_group = parser.add_mutually_exclusive_group()
+    base_group.add_argument('--binary_units',
+                            dest='units_base',
+                            action='store_const',
+                            const=1024,
+                            help='Use a base of 1024 when doing calculations of KB, MB, GB, & TB (This is default)')
+
+    base_group.add_argument('--decimal_units',
+                            dest='units_base',
+                            action='store_const',
+                            const=1000,
+                            help='Use a base of 1000 when doing calculations of KB, MB, GB, & TB')
+    parser.set_defaults(units_base=1024)
+
+    # Connection timeout
+    parser.add_argument('--timeout',
+                        dest='timeout',
+                        action='store',
+                        type=float,
+                        default=DEFAULT_TIMEOUT,
+                        help='Connection timeout in seconds. (default: %(default)s)')
+
+    # Container name
+    parser.add_argument('--containers',
+                        dest='containers',
+                        action='store',
+                        nargs='+',
+                        type=str,
+                        default=['all'],
+                        help='One or more RegEx that match the names of the container(s) to check. If omitted all containers are checked. (default: %(default)s)')
+
+    # Container name
+    parser.add_argument('--present',
+                        dest='present',
+                        default=False,
+                        action='store_true',
+                        help='Modifies --containers so that each RegEx must match at least one container.')
+
+    # Threads
+    parser.add_argument('--threads',
+                        dest='threads',
+                        default=DEFAULT_PARALLELISM,
+                        action='store',
+                        type=int,
+                        help='This + 1 is the maximum number of concurent threads/network connections. (default: %(default)s)')
+
+    # CPU
+    parser.add_argument('--cpu',
+                        dest='cpu',
+                        action='store',
+                        type=str,
+                        metavar='WARN:CRIT',
+                        help='Check cpu usage percentage taking into account any limits.')
+
+    # Memory
+    parser.add_argument('--memory',
+                        dest='memory',
+                        action='store',
+                        type=str,
+                        metavar='WARN:CRIT:UNITS',
+                        help='Check memory usage taking into account any limits. Valid values for units are %%,B,KB,MB,GB.')
+
+    # State
+    parser.add_argument('--status',
+                        dest='status',
+                        action='store',
+                        type=str,
+                        help='Desired container status (running, exited, etc).')
+
+    # Health
+    parser.add_argument('--health',
+                        dest='health',
+                        default=None,
+                        action='store_true',
+                        help="Check container's health check status")
+
+    # Age
+    parser.add_argument('--uptime',
+                        dest='uptime',
+                        action='store',
+                        type=str,
+                        metavar='WARN:CRIT',
+                        help='Minimum container uptime in seconds. Use when infrequent crashes are tolerated.')
+
+    # Image Age
+    parser.add_argument('--image-age',
+                        dest='image_age',
+                        action='store',
+                        type=str,
+                        metavar='WARN:CRIT',
+                        help='Maximum image age in days.')
+
+    # Version
+    parser.add_argument('--version',
+                        dest='version',
+                        default=None,
+                        action='store_true',
+                        help='Check if the running images are the same version as those in the registry. Useful for finding stale images. Does not support login.')
+
+    # Version
+    parser.add_argument('--insecure-registries',
+                        dest='insecure_registries',
+                        action='store',
+                        nargs='+',
+                        type=str,
+                        default=[],
+                        help='List of registries to connect to with http(no TLS). Useful when using "--version" with images from insecure registries.')
+
+    # Restart
+    parser.add_argument('--restarts',
+                        dest='restarts',
+                        action='store',
+                        type=str,
+                        metavar='WARN:CRIT',
+                        help='Container restart thresholds.')
+
+    # no-ok
+    parser.add_argument('--no-ok',
+                        dest='no_ok',
+                        action='store_true',
+                        help='Make output terse suppressing OK messages. If all checks are OK return a single OK.')
+
+    # no-performance
+    parser.add_argument('--no-performance',
+                        dest='no_performance',
+                        action='store_true',
+                        help='Suppress performance data. Reduces output when performance data is not being used.')
+
+    parser.add_argument('-V', action='version', version='%(prog)s {}'.format(__version__))
+
+    if len(args) == 0:
+        parser.print_help()
+
+    parsed_args = parser.parse_args(args=args)
+
+    global timeout
+    timeout = parsed_args.timeout
+
+    global daemon
+    global connection_type
+    if parsed_args.secure_connection:
+        daemon = 'https://' + parsed_args.secure_connection
+        connection_type = 'https'
+    elif parsed_args.connection:
+        if parsed_args.connection[0] == '/':
+            daemon = 'socket://' + parsed_args.connection + ':'
+            connection_type = 'socket'
+        else:
+            daemon = 'http://' + parsed_args.connection
+            connection_type = 'http'
+
+    return parsed_args
+
+
+def no_checks_present(parsed_args):
+    # Look for all functions whose name starts with 'check_'
+    checks = [key[6:] for key in globals().keys() if key.startswith('check_')]
+    # Act like --present is a check though it is not implemented like one
+    return all(getattr(parsed_args, check) is None for check in checks) and not parsed_args.present
+
+
+def socketfile_permissions_failure(parsed_args):
+    if connection_type == 'socket':
+        return not (os.path.exists(parsed_args.connection)
+                    and stat.S_ISSOCK(os.stat(parsed_args.connection).st_mode)
+                    and os.access(parsed_args.connection, os.R_OK)
+                    and os.access(parsed_args.connection, os.W_OK))
+    else:
+        return False
+
+
+def print_results():
+    if no_ok:
+        # Remove all the "OK"s
+        filtered_messages = [message for message in messages if not message.startswith('OK: ')]
+        if len(filtered_messages) == 0:
+            messages_concat = 'OK'
+        else:
+            messages_concat = '; '.join(filtered_messages)
+
+    else:
+        messages_concat = '; '.join(messages)
+
+    if no_performance or len(performance_data) == 0:
+        print(messages_concat)
+    else:
+        perfdata_concat = ' '.join(performance_data)
+        print(messages_concat + '|' + perfdata_concat)
+
+
+def perform_checks(raw_args):
+    args = process_args(raw_args)
+
+    global parallel_executor
+    parallel_executor = futures.ThreadPoolExecutor(max_workers=args.threads)
+    global serial_executor
+    serial_executor = futures.ThreadPoolExecutor(max_workers=1)
+
+    global unit_adjustments
+    unit_adjustments = {key: args.units_base ** value for key, value in UNIT_ADJUSTMENTS_TEMPLATE.items()}
+
+    global no_ok
+    no_ok = args.no_ok
+
+    global no_performance
+    no_performance = args.no_ok
+
+    if socketfile_permissions_failure(args):
+        unknown("Cannot access docker socket file. User ID={}, socket file={}".format(os.getuid(), args.connection))
+        return
+
+    if args.containers == ["all"] and args.present:
+        unknown("You can not use --present without --containers")
+        return
+
+    if no_checks_present(args):
+        unknown("No checks specified.")
+        return
+
+    # Here is where all the work happens
+    #############################################################################################
+    containers = get_containers(args.containers, args.present)
+
+    if len(containers) == 0 and not args.present:
+        unknown("No containers names found matching criteria")
+        return
+
+    for container in containers:
+
+        # Check status
+        if args.status:
+            check_status(container, args.status)
+
+        # Check version
+        if args.version:
+            check_version(container, args.insecure_registries)
+
+        # below are checks that require a 'running' status
+
+        # Check status
+        if args.health:
+            check_health(container)
+
+        # Check cpu usage
+        if args.cpu:
+            check_cpu(container, parse_thresholds(args.cpu, units_required=False))
+
+        # Check memory usage
+        if args.memory:
+            check_memory(container, parse_thresholds(args.memory, units_required=False))
+
+        # Check uptime
+        if args.uptime:
+            check_uptime(container, parse_thresholds(args.uptime, include_units=False))
+
+        # Check image age
+        if args.image_age:
+            check_image_age(container, parse_thresholds(args.image_age, include_units=False))
+
+        # Check restart count
+        if args.restarts:
+            check_restarts(container, parse_thresholds(args.restarts, include_units=False))
+
+
+def main():
+    try:
+        perform_checks(argv[1:])
+
+        # get results to let exceptions in threads bubble out
+        [x.result() for x in futures.as_completed(threads)]
+
+    except Exception as e:
+        traceback.print_exc()
+        unknown("Exception raised during check': {}".format(repr(e)))
+    print_results()
+    exit(rc)
+
+
+if __name__ == '__main__':
+    main()
@@ -7,17 +7,25 @@
 #

 rr="/var/run/reboot-required"
+rrpkgs="/var/run/reboot-required.pkgs"
 # 604800 - 1 week in seconds
 threshold="${1:-604800}"
 if [ -f "$rr" ]; then
 	# We have a pending reboot; alert in different states depending on its age
 	lastmod=$(date +%s -r "$rr")
 	now=$(date +%s)
+	pkgs="$(cat "$rr")"
+	if [ -f "$rrpkgs" ]; then
+		pkgs="$(cat "$rrpkgs")"
+	fi
+	if [ -z "$pkgs" ]; then
+		pkgs="(No output)"
+	fi
 	if (( now - lastmod > threshold )); then
-		echo "CRITICAL - Pending reboot older than $threshold seconds: $(cat "$rr")"
+		echo "CRITICAL - Pending reboot older than $threshold seconds: $pkgs"
 		exit 2
 	else
-		echo "WARNING - Pending reboot: $(cat "$rr")"
+		echo "WARNING - Pending reboot: $pkgs"
 		exit 1
 	fi
 else
Author	SHA1	Message	Date
salt	4d63d9f3e9	Add more details to check_reboot_required, adding packages to the output string	2021-09-14 17:20:59 -05:00
salt	dee3bce537	Add check_docker	2021-09-07 15:16:31 -05:00