#!/usr/bin/env python3
# logging.basicConfig(level=logging.DEBUG)
import argparse
import json
import logging
import math
import os
import re
import socket
import stat
import traceback
from collections import deque, namedtuple, UserDict, defaultdict
from concurrent import futures
from datetime import datetime, timezone
from functools import lru_cache
from http.client import HTTPConnection
from sys import argv
from urllib import request
from urllib.error import HTTPError, URLError
from urllib.request import AbstractHTTPHandler, HTTPHandler, HTTPSHandler, OpenerDirector, HTTPRedirectHandler, \
    Request, HTTPBasicAuthHandler

logger = logging.getLogger()
__author__ = 'Tim Laurence'
__copyright__ = "Copyright 2019"
__credits__ = ['Tim Laurence']
__license__ = "GPL"
__version__ = "2.2.2"

'''
nrpe compatible check for docker containers.

Requires Python 3

Note: I really would have preferred to have used requests for all the network connections but that would have added a
dependency.
'''

DEFAULT_SOCKET = '/var/run/docker.sock'
DEFAULT_TIMEOUT = 10.0
DEFAULT_PORT = 2375
DEFAULT_MEMORY_UNITS = 'B'
DEFAULT_HEADERS = [('Accept', 'application/vnd.docker.distribution.manifest.v2+json')]
DEFAULT_PUBLIC_REGISTRY = 'registry-1.docker.io'

# The second value is the power to raise the base to.
UNIT_ADJUSTMENTS_TEMPLATE = {
    '%': 0,
    'B': 0,
    'KB': 1,
    'MB': 2,
    'GB': 3,
    'TB': 4
}
unit_adjustments = None

# Reduce message to a single OK unless a check fails.
no_ok = False

# Suppress performance data reporting
no_performance = False

OK_RC = 0
WARNING_RC = 1
CRITICAL_RC = 2
UNKNOWN_RC = 3

# These hold the final results
rc = -1
messages = []
performance_data = []

ImageName = namedtuple('ImageName', "registry name tag full_name")


class ThresholdSpec(UserDict):
    """Dict-like holder for warn/crit threshold values plus the units they are expressed in."""

    def __init__(self, warn, crit, units=''):
        super().__init__(warn=warn, crit=crit, units=units)

    def __getattr__(self, item):
        # Allow attribute-style access (spec.warn) in addition to mapping access (spec['warn']).
        if item in ('warn', 'crit', 'units'):
            return self.data[item]
        return super().__getattr__(item)


# How much threading can we do? We are generally not CPU bound so I am using this as a worst-case cap
DEFAULT_PARALLELISM = 10

# Holds list of all threads
threads = []

# This is used during testing
DISABLE_THREADING = False


# Hacked up urllib to handle sockets
#############################################################################################
# Docker runs a http connection over a socket. http.client knows how to deal with these
# but lacks some niceties. Urllib wraps that and makes up for some of the deficiencies but
# cannot fix the fact http.client can't read from socket files. In order to take advantage of
# urllib and http.client's capabilities the class below tweaks HttpConnection and passes it
# to urllib registering for socket:// connections

# This is all side effect so excluding coverage
class SocketFileHandler(AbstractHTTPHandler):
    class SocketFileToHttpConnectionAdaptor(HTTPConnection):  # pragma: no cover
        def __init__(self, socket_file, timeout=DEFAULT_TIMEOUT):
            super().__init__(host='', port=0, timeout=timeout)
            self.socket_file = socket_file

        def connect(self):
            self.sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM, proto=0, fileno=None)
            self.sock.settimeout(self.timeout)
            self.sock.connect(self.socket_file)

    def socket_open(self, req):
        # URL form is socket://<socket file>:<request path>
        socket_file, path = req.selector.split(':', 1)
        req.host = socket_file
        req.selector = path
        return self.do_open(self.SocketFileToHttpConnectionAdaptor, req)


# Tokens are not cached because I expect the callers to cache the responses
class Oauth2TokenAuthHandler(HTTPBasicAuthHandler):
    """Handles the OAuth2 bearer-token handshake used by Docker registries (401 + www-authenticate)."""
    auth_failure_tracker = defaultdict(int)

    def http_response(self, request, response):
        code = response.code

        www_authenticate_header = response.headers.get('www-authenticate', None)
        if code == 401 and www_authenticate_header:
            scheme = www_authenticate_header.split()[0]
            if scheme.lower() == 'bearer':
                return self.process_oauth2(request, response, www_authenticate_header)

        return response

    https_response = http_response

    @staticmethod
    def _get_outh2_token(www_authenticate_header):
        # Named groups restored here; they had been stripped from the corrupted source.
        auth_fields = dict(re.findall(r"""(?:(?P<key>[^ ,=]+)="(?P<value>[^"]+)")""", www_authenticate_header))

        auth_url = "{realm}?scope={scope}&service={service}".format(
            realm=auth_fields['realm'],
            scope=auth_fields['scope'],
            service=auth_fields['service'],
        )
        token_request = Request(auth_url)
        token_request.add_header("Content-Type", "application/x-www-form-urlencoded; charset=utf-8")
        token_response = request.urlopen(token_request)
        return process_urllib_response(token_response)['token']

    def process_oauth2(self, request, response, www_authenticate_header):

        # This keeps infinite auth loops from happening
        full_url = request.full_url
        self.auth_failure_tracker[full_url] += 1
        if self.auth_failure_tracker[full_url] > 1:
            raise HTTPError(full_url, 401, "Stopping Oauth2 failure loop for {}".format(full_url),
                            response.headers, response)

        auth_token = self._get_outh2_token(www_authenticate_header)

        request.add_unredirected_header('Authorization', 'Bearer ' + auth_token)
        return self.parent.open(request, timeout=request.timeout)


better_urllib_get = OpenerDirector()
better_urllib_get.addheaders = DEFAULT_HEADERS.copy()
better_urllib_get.add_handler(HTTPHandler())
better_urllib_get.add_handler(HTTPSHandler())
better_urllib_get.add_handler(HTTPRedirectHandler())
better_urllib_get.add_handler(SocketFileHandler())
better_urllib_get.add_handler(Oauth2TokenAuthHandler())


class RegistryError(Exception):
    """Raised when a registry returns a non-200 response to a manifest request."""

    def __init__(self, response):
        self.response_obj = response


# Util functions
#############################################################################################
def parse_thresholds(spec, include_units=True, units_required=True):
    """
    Given a spec string break it up into ':' separated chunks. Convert strings to ints as it makes sense

    :param spec: The threshold specification being parsed
    :param include_units: Specifies that units should be processed and returned if present
    :param units_required: Mark spec as invalid if the units are missing.
    :return: A ThresholdSpec containing the thresholds in order of warn, crit, and units(if included and present)
    """
    parts = deque(spec.split(':'))
    if not all(parts):
        raise ValueError("Blanks are not allowed in a threshold specification: {}".format(spec))

    # Warn
    warn = int(parts.popleft())
    # Crit
    crit = int(parts.popleft())

    units = ''
    if include_units:
        if len(parts):
            # units
            units = parts.popleft()
        elif units_required:
            raise ValueError("Missing units in {}".format(spec))

    if len(parts) != 0:
        raise ValueError("Too many threshold specifiers in {}".format(spec))

    return ThresholdSpec(warn=warn, crit=crit, units=units)


def pretty_time(seconds):
    """Break a second count into human units, e.g. 90061 -> ['1d', '1h', '1min', '1s']."""
    remainder = seconds
    result = []
    if remainder > 24 * 60 * 60:
        days, remainder = divmod(remainder, 24 * 60 * 60)
        result.append("{}d".format(int(days)))
    if remainder > 60 * 60:
        hours, remainder = divmod(remainder, 60 * 60)
        result.append("{}h".format(int(hours)))
    if remainder > 60:
        minutes, remainder = divmod(remainder, 60)
        result.append("{}min".format(int(minutes)))
    result.append("{}s".format(int(remainder)))
    return result


def evaluate_numeric_thresholds(container, value, thresholds, name, short_name,
                                min=None, max=None, greater_than=True):
    """Append perfdata for a metric and record OK/WARNING/CRITICAL based on the thresholds."""
    rounder = lambda x: round(x, 2)

    INTEGER_UNITS = ['B', '%', '']

    # Some units don't have decimal places
    rounded_value = int(value) if thresholds.units in INTEGER_UNITS else rounder(value)

    perf_string = "{container}_{short_name}={value}{units};{warn};{crit}".format(
        container=container,
        short_name=short_name,
        value=rounded_value,
        **thresholds)
    if min is not None:
        rounded_min = math.floor(min) if thresholds.units in INTEGER_UNITS else rounder(min)
        perf_string += ';{}'.format(rounded_min)
    if max is not None:
        rounded_max = math.ceil(max) if thresholds.units in INTEGER_UNITS else rounder(max)
        perf_string += ';{}'.format(rounded_max)

    global performance_data
    performance_data.append(perf_string)

    if thresholds.units == 's':
        nice_time = ' '.join(pretty_time(rounded_value)[:2])
        results_str = "{} {} is {}".format(container, name, nice_time)
    else:
        results_str = "{} {} is {}{}".format(container, name, rounded_value, thresholds.units)

    if greater_than:
        comparator = lambda value, threshold: value >= threshold
    else:
        comparator = lambda value, threshold: value <= threshold

    if comparator(value, thresholds.crit):
        critical(results_str)
    elif comparator(value, thresholds.warn):
        warning(results_str)
    else:
        ok(results_str)


@lru_cache(maxsize=None)
def get_url(url):
    """GET a JSON document via the patched opener; cached so repeated inspects hit the daemon once."""
    logger.debug("get_url: {}".format(url))
    response = better_urllib_get.open(url, timeout=timeout)
    logger.debug("get_url: {} {}".format(url, response.status))
    return process_urllib_response(response), response.status


def process_urllib_response(response):
    response_bytes = response.read()
    body = response_bytes.decode('utf-8')
    # logger.debug("BODY: {}".format(body))
    return json.loads(body)


def get_container_info(name):
    content, _ = get_url(daemon + '/containers/{container}/json'.format(container=name))
    return content


def get_image_info(name):
    content, _ = get_url(daemon + '/images/{image}/json'.format(image=name))
    return content


def get_state(container):
    return get_container_info(container)['State']


def get_stats(container):
    content, _ = get_url(daemon + '/containers/{container}/stats?stream=0'.format(container=container))
    return content


def get_ps_name(name_list):
    # Pick the name that starts with a '/' but doesn't contain a '/' and return that value
    for name in name_list:
        if '/' not in name[1:] and name[0] == '/':
            return name[1:]
    else:
        raise NameError("Error when trying to identify 'ps' name in {}".format(name_list))


def get_containers(names, require_present):
    """Return the set of container names matching the given regexes ('all' matches everything)."""
    containers_list, _ = get_url(daemon + '/containers/json?all=1')

    all_container_names = set(get_ps_name(x['Names']) for x in containers_list)

    if 'all' in names:
        return all_container_names

    filtered = set()
    for matcher in names:
        found = False
        for candidate in all_container_names:
            if re.match("^{}$".format(matcher), candidate):
                filtered.add(candidate)
                found = True
        # If we don't find a container that matches our regex
        if require_present and not found:
            critical("No containers match {}".format(matcher))

    return filtered


def get_container_image_id(container):
    # find registry and tag
    inspection = get_container_info(container)
    return inspection['Image']


def get_container_image_urls(container):
    inspection = get_container_info(container)
    image_id = inspection['Image']
    image_info = get_image_info(image_id)
    return image_info['RepoTags']


def normalize_image_name_to_manifest_url(image_name, insecure_registries):
    """Turn an image name into its registry manifest URL, using http for insecure registries."""
    parsed_url = parse_image_name(image_name)

    lower_insecure = [reg.lower() for reg in insecure_registries]

    # Registry query url
    scheme = 'http' if parsed_url.registry.lower() in lower_insecure else 'https'
    url = '{scheme}://{registry}/v2/{image_name}/manifests/{image_tag}'.format(scheme=scheme,
                                                                               registry=parsed_url.registry,
                                                                               image_name=parsed_url.name,
                                                                               image_tag=parsed_url.tag)
    return url, parsed_url.registry


# Auth servers seem picky about being hit too hard. Can't figure out why. ;)
# As result it is best to single thread this check
# This is based on https://docs.docker.com/registry/spec/auth/token/#requesting-a-token
def get_digest_from_registry(url):
    logger.debug("get_digest_from_registry")
    # query registry
    # TODO: Handle logging in if needed
    registry_info, status_code = get_url(url=url)

    if status_code != 200:
        raise RegistryError(response=registry_info)
    return registry_info['config'].get('digest', None)


def set_rc(new_rc):
    # Only ever escalate the return code, never lower it.
    global rc
    rc = new_rc if new_rc > rc else rc


def ok(message):
    set_rc(OK_RC)
    messages.append('OK: ' + message)


def warning(message):
    set_rc(WARNING_RC)
    messages.append('WARNING: ' + message)


def critical(message):
    set_rc(CRITICAL_RC)
    messages.append('CRITICAL: ' + message)


def unknown(message):
    set_rc(UNKNOWN_RC)
    messages.append('UNKNOWN: ' + message)


def require_running(name):
    """Decorator: only run the check when the container is running, otherwise record CRITICAL."""

    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            container_state = get_state(container)
            state = normalize_state(container_state)
            if state.lower() == "running":
                func(container, *args, **kwargs)
            else:
                # container is not running, can't perform check
                critical('{container} is not "running", cannot check {check}"'.format(container=container,
                                                                                      check=name))

        return wrapper

    return inner_decorator


def multithread_execution(disable_threading=DISABLE_THREADING):
    """Decorator: submit the check to the parallel executor (or run inline when threading is disabled)."""

    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            if DISABLE_THREADING:
                func(container, *args, **kwargs)
            else:
                threads.append(parallel_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator


def singlethread_execution(disable_threading=DISABLE_THREADING):
    """Decorator: submit the check to the single-worker executor (or run inline when threading is disabled)."""

    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            if DISABLE_THREADING:
                func(container, *args, **kwargs)
            else:
                threads.append(serial_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator


def parse_image_name(image_name):
    """
    Parses image names into their constituent parts.
    :param image_name:
    :return: ImageName
    """

    # These are based on information found here
    # https://docs.docker.com/engine/reference/commandline/tag/#extended-description
    # https://github.com/docker/distribution/blob/master/reference/regexp.go
    host_segment_re = '[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?'
    hostname_re = r'({host_segment}\.)+{host_segment}'.format(host_segment=host_segment_re)
    registry_re = r'((?P<registry>({hostname_re}(:\d+)?|{host_segment_re}:\d+))/)'.format(
        host_segment_re=host_segment_re, hostname_re=hostname_re)
    name_component_ends_re = '[a-z0-9]'
    name_component_middle_re = '[a-z0-9._-]'  # Ignoring spec limit of two _
    name_component_re = '({end}{middle}*{end}|{end})'.format(end=name_component_ends_re,
                                                             middle=name_component_middle_re)
    image_name_re = "(?P<image_name>({name_component}/)*{name_component})".format(name_component=name_component_re)
    image_tag_re = '(?P<image_tag>[a-zA-Z0-9_][a-zA-Z0-9_.-]*)'
    full_re = '^{registry}?{image_name}(:{image_tag})?$'.format(registry=registry_re, image_name=image_name_re,
                                                                image_tag=image_tag_re)
    parsed = re.match(full_re, image_name)

    registry = parsed.group('registry') if parsed.group('registry') else DEFAULT_PUBLIC_REGISTRY

    image_name = parsed.group('image_name')
    image_name = image_name if '/' in image_name or registry != DEFAULT_PUBLIC_REGISTRY else 'library/' + image_name

    image_tag = parsed.group('image_tag')
    image_tag = image_tag if image_tag else 'latest'

    full_image_name = "{registry}/{image_name}:{image_tag}".format(
        registry=registry,
        image_name=image_name,
        image_tag=image_tag)

    return ImageName(registry=registry, name=image_name, tag=image_tag, full_name=full_image_name)


def normalize_state(status_info):
    # Ugh, docker used to report state in a silly way then they figured out how to do it better.
    # This tries the simpler new way and if that doesn't work falls back to the old way

    # On new docker engines the status holds whatever the current state is, running, stopped, paused, etc.
    if "Status" in status_info:
        return status_info['Status']

    status = 'Exited'
    if status_info["Restarting"]:
        status = 'Restarting'
    elif status_info["Paused"]:
        status = 'Paused'
    elif status_info["Dead"]:
        status = 'Dead'
    elif status_info["Running"]:
        return "Running"
    return status


# Checks
#############################################################################################

@multithread_execution()
@require_running(name='memory')
def check_memory(container, thresholds):
    if thresholds.units not in unit_adjustments:
        unknown("Memory units must be one of {}".format(list(unit_adjustments.keys())))
        return

    inspection = get_stats(container)

    # Subtracting cache to match what `docker stats` does.
    adjusted_usage = inspection['memory_stats']['usage'] - inspection['memory_stats']['stats']['total_cache']
    if thresholds.units == '%':
        max = 100
        usage = int(100 * adjusted_usage / inspection['memory_stats']['limit'])
    else:
        max = inspection['memory_stats']['limit'] / unit_adjustments[thresholds.units]
        usage = adjusted_usage / unit_adjustments[thresholds.units]

    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='memory',
                                short_name='mem', min=0, max=max)


@multithread_execution()
def check_status(container, desired_state):
    normalized_desired_state = desired_state.lower()
    normalized_state = normalize_state(get_state(container)).lower()
    if normalized_desired_state != normalized_state:
        critical("{} state is not {}".format(container, desired_state))
        return
    ok("{} status is {}".format(container, desired_state))


@multithread_execution()
@require_running('health')
def check_health(container):
    state = get_state(container)
    if "Health" in state and "Status" in state["Health"]:
        health = state["Health"]["Status"]
        message = "{} is {}".format(container, health)
        if health == 'healthy':
            ok(message)
        elif health == 'unhealthy':
            critical(message)
        else:
            unknown(message)
    else:
        unknown('{} has no health check data'.format(container))


@multithread_execution()
@require_running('uptime')
def check_uptime(container, thresholds):
    inspection = get_container_info(container)['State']['StartedAt']
    only_secs = inspection[0:19]
    start = datetime.strptime(only_secs, "%Y-%m-%dT%H:%M:%S")
    start = start.replace(tzinfo=timezone.utc)
    now = datetime.now(timezone.utc)
    uptime = (now - start).total_seconds()

    graph_padding = 2
    # NOTE(review): this sets an instance attribute shadowing the UserDict entry, so perfdata
    # still shows the original (empty) units while display uses 's' — preserved as-is.
    thresholds.units = 's'
    evaluate_numeric_thresholds(container=container, value=uptime, thresholds=thresholds, name='uptime',
                                short_name='up', min=0, max=graph_padding, greater_than=False)


@multithread_execution()
def check_image_age(container, thresholds):
    container_image = get_container_info(container)['Image']
    image_created = get_image_info(container_image)['Created']
    only_secs = image_created[0:19]
    start = datetime.strptime(only_secs, "%Y-%m-%dT%H:%M:%S")
    start = start.replace(tzinfo=timezone.utc)
    now = datetime.now(timezone.utc)
    image_age = (now - start).days

    graph_padding = 2
    thresholds.units = 'd'
    evaluate_numeric_thresholds(container=container, value=image_age, thresholds=thresholds, name='image_age',
                                short_name='age', min=0, max=graph_padding, greater_than=True)


@multithread_execution()
@require_running('restarts')
def check_restarts(container, thresholds):
    inspection = get_container_info(container)

    restarts = int(inspection['RestartCount'])
    graph_padding = 2
    evaluate_numeric_thresholds(container=container, value=restarts, thresholds=thresholds, name='restarts',
                                short_name='re', min=0, max=graph_padding)


@singlethread_execution()
def check_version(container, insecure_registries):
    image_id = get_container_image_id(container)
    logger.debug("Local container image ID: {}".format(image_id))
    if image_id is None:
        unknown('Checksum missing for "{}", try doing a pull'.format(container))
        return

    image_urls = get_container_image_urls(container=container)
    if len(image_urls) > 1:
        unknown('"{}" has multiple tags/names. Unsure which one to use to check the version.'.format(container))
        return
    elif len(image_urls) == 0:
        unknown('"{}" has last no repository tag. Is this anywhere else?'.format(container))
        return

    url, registry = normalize_image_name_to_manifest_url(image_urls[0], insecure_registries)
    logger.debug("Looking up image digest here {}".format(url))
    try:
        registry_hash = get_digest_from_registry(url)
    except URLError as e:
        if hasattr(e.reason, 'reason') and e.reason.reason == 'UNKNOWN_PROTOCOL':
            unknown(
                "TLS error connecting to registry {} for {}, should you use the '--insecure-registry' flag?" \
                    .format(registry, container))
            return
        elif hasattr(e.reason, 'strerror') and e.reason.strerror == 'nodename nor servname provided, or not known':
            unknown(
                "Cannot reach registry for {} at {}".format(container, url))
            return
        else:
            raise e
    except RegistryError:
        unknown("Cannot check version, couldn't retrieve digest for {} while checking {}.".format(container, url))
        return
    logger.debug("Image digests, local={} remote={}".format(image_id, registry_hash))
    if registry_hash == image_id:
        ok("{}'s version matches registry".format(container))
        return
    critical("{}'s version does not match registry".format(container))


def calculate_cpu_capacity_precentage(info, stats):
    """Compute CPU usage as a percentage of whatever limit (--cpus/--cpu-quota/none) applies."""
    host_config = info['HostConfig']

    if 'online_cpus' in stats['cpu_stats']:
        num_cpus = stats['cpu_stats']['online_cpus']
    else:
        num_cpus = len(stats['cpu_stats']['cpu_usage']['percpu_usage'])

    # Identify limit system being used
    # --cpus
    if 'NanoCpus' in host_config and host_config['NanoCpus'] != 0:
        period = 1000000000
        quota = host_config['NanoCpus']
    # --cpu-quota
    elif 'CpuQuota' in host_config and host_config['CpuQuota'] != 0:
        period = 100000 if host_config['CpuPeriod'] == 0 else host_config['CpuPeriod']
        quota = host_config['CpuQuota']
    # unlimited
    else:
        period = 1
        quota = num_cpus

    if period * num_cpus < quota:
        # This handles the case where the quota is actually bigger than amount available by all the cpus.
        available_limit_ratio = 1
    else:
        available_limit_ratio = (period * num_cpus) / quota

    cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
    system_delta = stats['cpu_stats']['system_cpu_usage'] - stats['precpu_stats']['system_cpu_usage']
    usage = (cpu_delta / system_delta) * available_limit_ratio
    usage = round(usage * 100, 0)
    return usage


@multithread_execution()
@require_running('cpu')
def check_cpu(container, thresholds):
    info = get_container_info(container)

    stats = get_stats(container=container)

    usage = calculate_cpu_capacity_precentage(info=info, stats=stats)

    max = 100
    thresholds.units = '%'
    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='cpu', short_name='cpu',
                                min=0, max=max)


def process_args(args):
    """Build the argparse parser, parse args, and set the timeout/daemon/connection_type globals."""
    parser = argparse.ArgumentParser(description='Check docker containers.')

    # Connect to local socket or ip address
    connection_group = parser.add_mutually_exclusive_group()
    connection_group.add_argument('--connection',
                                  dest='connection',
                                  action='store',
                                  default=DEFAULT_SOCKET,
                                  type=str,
                                  metavar='[/<path to>/docker.socket|<ip/host address>:<port>]',
                                  help='Where to find docker daemon socket. (default: %(default)s)')

    connection_group.add_argument('--secure-connection',
                                  dest='secure_connection',
                                  action='store',
                                  type=str,
                                  metavar='[<ip/host address>:<port>]',
                                  help='Where to find TLS protected docker daemon socket.')

    base_group = parser.add_mutually_exclusive_group()
    base_group.add_argument('--binary_units',
                            dest='units_base',
                            action='store_const',
                            const=1024,
                            help='Use a base of 1024 when doing calculations of KB, MB, GB, & TB (This is default)')

    base_group.add_argument('--decimal_units',
                            dest='units_base',
                            action='store_const',
                            const=1000,
                            help='Use a base of 1000 when doing calculations of KB, MB, GB, & TB')
    parser.set_defaults(units_base=1024)

    # Connection timeout
    parser.add_argument('--timeout',
                        dest='timeout',
                        action='store',
                        type=float,
                        default=DEFAULT_TIMEOUT,
                        help='Connection timeout in seconds. (default: %(default)s)')

    # Container name
    parser.add_argument('--containers',
                        dest='containers',
                        action='store',
                        nargs='+',
                        type=str,
                        default=['all'],
                        help='One or more RegEx that match the names of the container(s) to check. If omitted all containers are checked. (default: %(default)s)')

    # Container name
    parser.add_argument('--present',
                        dest='present',
                        default=False,
                        action='store_true',
                        help='Modifies --containers so that each RegEx must match at least one container.')

    # Threads
    parser.add_argument('--threads',
                        dest='threads',
                        default=DEFAULT_PARALLELISM,
                        action='store',
                        type=int,
                        help='This + 1 is the maximum number of concurent threads/network connections. (default: %(default)s)')

    # CPU
    parser.add_argument('--cpu',
                        dest='cpu',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Check cpu usage percentage taking into account any limits.')

    # Memory
    parser.add_argument('--memory',
                        dest='memory',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT:UNITS',
                        help='Check memory usage taking into account any limits. Valid values for units are %%,B,KB,MB,GB.')

    # State
    parser.add_argument('--status',
                        dest='status',
                        action='store',
                        type=str,
                        help='Desired container status (running, exited, etc).')

    # Health
    parser.add_argument('--health',
                        dest='health',
                        default=None,
                        action='store_true',
                        help="Check container's health check status")

    # Age
    parser.add_argument('--uptime',
                        dest='uptime',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Minimum container uptime in seconds. Use when infrequent crashes are tolerated.')

    # Image Age
    parser.add_argument('--image-age',
                        dest='image_age',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Maximum image age in days.')

    # Version
    parser.add_argument('--version',
                        dest='version',
                        default=None,
                        action='store_true',
                        help='Check if the running images are the same version as those in the registry. Useful for finding stale images. Does not support login.')

    # Version
    parser.add_argument('--insecure-registries',
                        dest='insecure_registries',
                        action='store',
                        nargs='+',
                        type=str,
                        default=[],
                        help='List of registries to connect to with http(no TLS). Useful when using "--version" with images from insecure registries.')

    # Restart
    parser.add_argument('--restarts',
                        dest='restarts',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Container restart thresholds.')

    # no-ok
    parser.add_argument('--no-ok',
                        dest='no_ok',
                        action='store_true',
                        help='Make output terse suppressing OK messages. If all checks are OK return a single OK.')

    # no-performance
    parser.add_argument('--no-performance',
                        dest='no_performance',
                        action='store_true',
                        help='Suppress performance data. Reduces output when performance data is not being used.')

    parser.add_argument('-V', action='version', version='%(prog)s {}'.format(__version__))

    if len(args) == 0:
        parser.print_help()

    parsed_args = parser.parse_args(args=args)

    global timeout
    timeout = parsed_args.timeout

    global daemon
    global connection_type
    if parsed_args.secure_connection:
        daemon = 'https://' + parsed_args.secure_connection
        connection_type = 'https'
    elif parsed_args.connection:
        if parsed_args.connection[0] == '/':
            daemon = 'socket://' + parsed_args.connection + ':'
            connection_type = 'socket'
        else:
            daemon = 'http://' + parsed_args.connection
            connection_type = 'http'

    return parsed_args


def no_checks_present(parsed_args):
    # Look for all functions whose name starts with 'check_'
    checks = [key[6:] for key in globals().keys() if key.startswith('check_')]
    # Act like --present is a check though it is not implemented like one
    return all(getattr(parsed_args, check) is None for check in checks) and not parsed_args.present


def socketfile_permissions_failure(parsed_args):
    if connection_type == 'socket':
        return not (os.path.exists(parsed_args.connection)
                    and stat.S_ISSOCK(os.stat(parsed_args.connection).st_mode)
                    and os.access(parsed_args.connection, os.R_OK)
                    and os.access(parsed_args.connection, os.W_OK))
    else:
        return False


def print_results():
    """Emit the final nagios-style status line, optionally terse and/or without perfdata."""
    if no_ok:
        # Remove all the "OK"s
        filtered_messages = [message for message in messages if not message.startswith('OK: ')]
        if len(filtered_messages) == 0:
            messages_concat = 'OK'
        else:
            messages_concat = '; '.join(filtered_messages)

    else:
        messages_concat = '; '.join(messages)

    if no_performance or len(performance_data) == 0:
        print(messages_concat)
    else:
        perfdata_concat = ' '.join(performance_data)
        print(messages_concat + '|' + perfdata_concat)


def perform_checks(raw_args):
    args = process_args(raw_args)

    global parallel_executor
    parallel_executor = futures.ThreadPoolExecutor(max_workers=args.threads)
    global serial_executor
    serial_executor = futures.ThreadPoolExecutor(max_workers=1)

    global unit_adjustments
    unit_adjustments = {key: args.units_base ** value for key, value in UNIT_ADJUSTMENTS_TEMPLATE.items()}

    global no_ok
    no_ok = args.no_ok

    global no_performance
    # BUG FIX: this previously read args.no_ok, so --no-performance was ignored
    # and --no-ok wrongly suppressed perfdata as well.
    no_performance = args.no_performance

    if socketfile_permissions_failure(args):
        unknown("Cannot access docker socket file. User ID={}, socket file={}".format(os.getuid(), args.connection))
        return

    if args.containers == ["all"] and args.present:
        unknown("You can not use --present without --containers")
        return

    if no_checks_present(args):
        unknown("No checks specified.")
        return

    # Here is where all the work happens
    #############################################################################################
    containers = get_containers(args.containers, args.present)

    if len(containers) == 0 and not args.present:
        unknown("No containers names found matching criteria")
        return

    for container in containers:

        # Check status
        if args.status:
            check_status(container, args.status)

        # Check version
        if args.version:
            check_version(container, args.insecure_registries)

        # below are checks that require a 'running' status

        # Check health
        if args.health:
            check_health(container)

        # Check cpu usage
        if args.cpu:
            check_cpu(container, parse_thresholds(args.cpu, units_required=False))

        # Check memory usage
        if args.memory:
            check_memory(container, parse_thresholds(args.memory, units_required=False))

        # Check uptime
        if args.uptime:
            check_uptime(container, parse_thresholds(args.uptime, include_units=False))

        # Check image age
        if args.image_age:
            check_image_age(container, parse_thresholds(args.image_age, include_units=False))

        # Check restart count
        if args.restarts:
            check_restarts(container, parse_thresholds(args.restarts, include_units=False))


def main():
    try:
        perform_checks(argv[1:])

        # get results to let exceptions in threads bubble out
        [x.result() for x in futures.as_completed(threads)]

    except Exception as e:
        traceback.print_exc()
        unknown("Exception raised during check': {}".format(repr(e)))
    print_results()
    exit(rc)


if __name__ == '__main__':
    main()