Add check_docker
This commit is contained in:
parent
83081b757b
commit
dee3bce537
996
check_docker
Executable file
996
check_docker
Executable file
@ -0,0 +1,996 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# logging.basicConfig(level=logging.DEBUG)
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import socket
|
||||||
|
import stat
|
||||||
|
import traceback
|
||||||
|
from collections import deque, namedtuple, UserDict, defaultdict
|
||||||
|
from concurrent import futures
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from functools import lru_cache
|
||||||
|
from http.client import HTTPConnection
|
||||||
|
from sys import argv
|
||||||
|
from urllib import request
|
||||||
|
from urllib.error import HTTPError, URLError
|
||||||
|
from urllib.request import AbstractHTTPHandler, HTTPHandler, HTTPSHandler, OpenerDirector, HTTPRedirectHandler, \
|
||||||
|
Request, HTTPBasicAuthHandler
|
||||||
|
|
||||||
|
logger = logging.getLogger()
|
||||||
|
__author__ = 'Tim Laurence'
|
||||||
|
__copyright__ = "Copyright 2019"
|
||||||
|
__credits__ = ['Tim Laurence']
|
||||||
|
__license__ = "GPL"
|
||||||
|
__version__ = "2.2.2"
|
||||||
|
|
||||||
|
'''
|
||||||
|
nrpe compatible check for docker containers.
|
||||||
|
|
||||||
|
Requires Python 3
|
||||||
|
|
||||||
|
Note: I really would have preferred to have used requests for all the network connections but that would have added a
|
||||||
|
dependency.
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Connection defaults (overridable from the CLI)
DEFAULT_SOCKET = '/var/run/docker.sock'
DEFAULT_TIMEOUT = 10.0
DEFAULT_PORT = 2375
DEFAULT_MEMORY_UNITS = 'B'
# Ask registries for a v2 manifest so the config digest is comparable to the local image ID
DEFAULT_HEADERS = [('Accept', 'application/vnd.docker.distribution.manifest.v2+json')]
DEFAULT_PUBLIC_REGISTRY = 'registry-1.docker.io'

# The second value is the power to raise the base to.
# (The base — 1000 or 1024 — is chosen at runtime; see unit_adjustments below.)
UNIT_ADJUSTMENTS_TEMPLATE = {
    '%': 0,
    'B': 0,
    'KB': 1,
    'MB': 2,
    'GB': 3,
    'TB': 4
}
# Populated at runtime from UNIT_ADJUSTMENTS_TEMPLATE with the selected base applied.
unit_adjustments = None

# Reduce output to a single OK line unless some check fails.
no_ok = False

# Suppress performance data reporting
no_performance = False

# Nagios/NRPE return codes
OK_RC = 0
WARNING_RC = 1
CRITICAL_RC = 2
UNKNOWN_RC = 3

# These hold the final results; -1 means "no check has reported yet"
rc = -1
messages = []
performance_data = []

# Parsed parts of an image reference, e.g. registry-1.docker.io/library/nginx:latest
ImageName = namedtuple('ImageName', "registry name tag full_name")
|
||||||
|
|
||||||
|
|
||||||
|
class ThresholdSpec(UserDict):
    """Warn/crit threshold pair plus optional units.

    Values are reachable both as mapping keys (``spec['warn']``) and as
    attributes (``spec.warn``); the mapping interface also allows
    ``"...".format(**spec)`` unpacking elsewhere in this file.
    """

    def __init__(self, warn, crit, units=''):
        super().__init__(warn=warn, crit=crit, units=units)

    def __getattr__(self, item):
        # Only called when normal attribute lookup fails.
        if item in ('warn', 'crit', 'units'):
            return self.data[item]
        # BUG FIX: neither UserDict nor object defines __getattr__, so the
        # original `return super().__getattr__(item)` itself blew up with a
        # misleading "'super' object has no attribute '__getattr__'".
        # Raise a proper AttributeError naming the missing attribute instead
        # (same exception type, so callers catching AttributeError still work).
        raise AttributeError(item)
|
||||||
|
|
||||||
|
|
||||||
|
# How much threading can we do? We are generally not CPU bound so I am using this a worse case cap
|
||||||
|
DEFAULT_PARALLELISM = 10
|
||||||
|
|
||||||
|
# Holds list of all threads
|
||||||
|
threads = []
|
||||||
|
|
||||||
|
# This is used during testing
|
||||||
|
DISABLE_THREADING = False
|
||||||
|
|
||||||
|
|
||||||
|
# Hacked up urllib to handle sockets
|
||||||
|
#############################################################################################
|
||||||
|
# Docker runs a http connection over a socket. http.client is knows how to deal with these
|
||||||
|
# but lacks some niceties. Urllib wraps that and makes up for some of the deficiencies but
|
||||||
|
# cannot fix the fact http.client can't read from socket files. In order to take advantage of
|
||||||
|
# urllib and http.client's capabilities the class below tweaks HttpConnection and passes it
|
||||||
|
# to urllib registering for socket:// connections
|
||||||
|
|
||||||
|
# This is all side effect so excluding coverage
|
||||||
|
class SocketFileHandler(AbstractHTTPHandler):
    """urllib handler that lets the opener speak HTTP over a unix socket file.

    Registered for `socket://` URLs of the form `socket://<file>:<path>`;
    urllib discovers the `socket_open` method by the URL scheme name.
    """

    class SocketFileToHttpConnectionAdaptor(HTTPConnection):  # pragma: no cover
        # HTTPConnection that connects to an AF_UNIX socket file instead of a
        # host:port pair. Everything else (request/response parsing) is inherited.
        def __init__(self, socket_file, timeout=DEFAULT_TIMEOUT):
            # host/port are dummies; connect() ignores them entirely
            super().__init__(host='', port=0, timeout=timeout)
            self.socket_file = socket_file

        def connect(self):
            # Replace the TCP connect with a unix-domain stream connect
            self.sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM, proto=0, fileno=None)
            self.sock.settimeout(self.timeout)
            self.sock.connect(self.socket_file)

    def socket_open(self, req):
        # Split "socket://<file>:<path>" into the socket file and the HTTP path
        socket_file, path = req.selector.split(':', 1)
        req.host = socket_file
        req.selector = path
        return self.do_open(self.SocketFileToHttpConnectionAdaptor, req)
|
||||||
|
|
||||||
|
|
||||||
|
# Tokens are not cached because I expect the callers to cache the responses
|
||||||
|
# Tokens are not cached because I expect the callers to cache the responses
class Oauth2TokenAuthHandler(HTTPBasicAuthHandler):
    """urllib handler implementing the docker registry Bearer-token dance.

    On a 401 with a `www-authenticate: Bearer ...` header it fetches a token
    from the indicated auth service and retries the request once.
    """

    # Per-URL count of 401 retries, shared across instances; used to break
    # infinite auth loops when the token is rejected repeatedly.
    auth_failure_tracker = defaultdict(int)

    def http_response(self, request, response):
        code, hdrs = response.code, response.headers

        www_authenticate_header = response.headers.get('www-authenticate', None)
        # Only intercept Bearer challenges; anything else passes through untouched
        if code == 401 and www_authenticate_header:
            scheme = www_authenticate_header.split()[0]
            if scheme.lower() == 'bearer':
                return self.process_oauth2(request, response, www_authenticate_header)

        return response

    # Same handling for TLS responses
    https_response = http_response

    @staticmethod
    def _get_outh2_token(www_authenticate_header):
        """Fetch a Bearer token from the auth service named in the challenge.

        NOTE: here `request` is the urllib.request MODULE (module-level
        import), unlike in process_oauth2 below where it is a parameter.
        """
        # Pull realm/scope/service out of the key="value" pairs in the header
        auth_fields = dict(re.findall(r"""(?:(?P<key>[^ ,=]+)="([^"]+)")""", www_authenticate_header))

        auth_url = "{realm}?scope={scope}&service={service}".format(
            realm=auth_fields['realm'],
            scope=auth_fields['scope'],
            service=auth_fields['service'],
        )
        token_request = Request(auth_url)
        token_request.add_header("Content-Type", "application/x-www-form-urlencoded; charset=utf-8")
        token_response = request.urlopen(token_request)
        return process_urllib_response(token_response)['token']

    def process_oauth2(self, request, response, www_authenticate_header):
        """Acquire a token for the challenge and replay the original request."""

        # This keeps infinite auth loops from happening
        full_url = request.full_url
        self.auth_failure_tracker[full_url] += 1
        if self.auth_failure_tracker[full_url] > 1:
            raise HTTPError(full_url, 401, "Stopping Oauth2 failure loop for {}".format(full_url),
                            response.headers, response)

        auth_token = self._get_outh2_token(www_authenticate_header)

        # Unredirected so the token is not leaked to other hosts on redirect
        request.add_unredirected_header('Authorization', 'Bearer ' + auth_token)
        return self.parent.open(request, timeout=request.timeout)
|
||||||
|
|
||||||
|
|
||||||
|
# Module-wide GET opener: plain http/https, redirects, unix-socket URLs, and
# registry Bearer-token auth all wired into one OpenerDirector.
better_urllib_get = OpenerDirector()
# Copy so later header tweaks don't mutate the DEFAULT_HEADERS constant
better_urllib_get.addheaders = DEFAULT_HEADERS.copy()
better_urllib_get.add_handler(HTTPHandler())
better_urllib_get.add_handler(HTTPSHandler())
better_urllib_get.add_handler(HTTPRedirectHandler())
better_urllib_get.add_handler(SocketFileHandler())
better_urllib_get.add_handler(Oauth2TokenAuthHandler())
|
||||||
|
|
||||||
|
|
||||||
|
class RegistryError(Exception):
    """Raised when a registry answers a manifest request with a non-200 status."""

    def __init__(self, response):
        # Keep the parsed registry reply around so callers can inspect it
        self.response_obj = response
|
||||||
|
|
||||||
|
|
||||||
|
# Util functions
|
||||||
|
#############################################################################################
|
||||||
|
def parse_thresholds(spec, include_units=True, units_required=True):
    """Split a ':'-separated threshold spec into warn, crit, and units.

    :param spec: The threshold specification being parsed
    :param include_units: Specifies that units should be processed and returned if present
    :param units_required: Mark spec as invalid if the units are missing.
    :return: A ThresholdSpec holding warn, crit, and units (if included and present)
    :raises ValueError: on blank fields, missing required units, or extra fields
    """
    fields = deque(spec.split(':'))
    if not all(fields):
        raise ValueError("Blanks are not allowed in a threshold specification: {}".format(spec))

    # First two fields are always the warn and crit levels
    warn = int(fields.popleft())
    crit = int(fields.popleft())

    units = ''
    if include_units:
        if fields:
            units = fields.popleft()
        elif units_required:
            raise ValueError("Missing units in {}".format(spec))

    # Anything left over is an error
    if fields:
        raise ValueError("Too many threshold specifiers in {}".format(spec))

    return ThresholdSpec(warn=warn, crit=crit, units=units)
|
||||||
|
|
||||||
|
|
||||||
|
def pretty_time(seconds):
    """Break a duration in seconds into a list of human-readable unit strings.

    e.g. 90061 -> ['1d', '1h', '1min', '1s']. Seconds are always present;
    larger units appear only when the duration strictly exceeds them.
    """
    DAY = 24 * 60 * 60
    HOUR = 60 * 60
    MINUTE = 60

    pieces = []
    left = seconds
    if left > DAY:
        days, left = divmod(left, DAY)
        pieces.append("{}d".format(int(days)))
    if left > HOUR:
        hours, left = divmod(left, HOUR)
        pieces.append("{}h".format(int(hours)))
    if left > MINUTE:
        minutes, left = divmod(left, MINUTE)
        pieces.append("{}min".format(int(minutes)))
    pieces.append("{}s".format(int(left)))
    return pieces
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_numeric_thresholds(container, value, thresholds, name, short_name,
                                min=None, max=None, greater_than=True):
    """Compare a measured value to warn/crit thresholds, record the result.

    Appends one nagios perfdata entry to the global performance_data list and
    calls ok()/warning()/critical() with a human-readable message.

    :param container: container name (used in messages and perfdata labels)
    :param value: the measured value being checked
    :param thresholds: ThresholdSpec with warn, crit, units
    :param name: long name used in the status message
    :param short_name: short label used in the perfdata key
    :param min: optional graph minimum appended to perfdata
    :param max: optional graph maximum appended to perfdata
    :param greater_than: when True a value >= threshold trips it; when False,
        <= trips it (used for uptime where low values are bad)

    NOTE: `min`/`max` shadow the builtins, but they are part of the keyword
    interface used by every caller in this file, so they stay.
    """
    rounder = lambda x: round(x, 2)

    INTEGER_UNITS = ['B', '%', '']

    # Some units don't have decimal places
    rounded_value = int(value) if thresholds.units in INTEGER_UNITS else rounder(value)

    # Nagios perfdata: label=value[units];warn;crit[;min[;max]]
    # (**thresholds supplies warn/crit/units via the mapping interface)
    perf_string = "{container}_{short_name}={value}{units};{warn};{crit}".format(
        container=container,
        short_name=short_name,
        value=rounded_value,
        **thresholds)
    if min is not None:
        rounded_min = math.floor(min) if thresholds.units in INTEGER_UNITS else rounder(min)
        perf_string += ';{}'.format(rounded_min)
    if max is not None:
        rounded_max = math.ceil(max) if thresholds.units in INTEGER_UNITS else rounder(max)
        perf_string += ';{}'.format(rounded_max)

    global performance_data
    performance_data.append(perf_string)

    # Durations get a friendly "1d 2h" style message instead of raw seconds
    if thresholds.units == 's':
        nice_time = ' '.join(pretty_time(rounded_value)[:2])
        results_str = "{} {} is {}".format(container, name, nice_time)
    else:
        results_str = "{} {} is {}{}".format(container, name, rounded_value, thresholds.units)

    if greater_than:
        comparator = lambda value, threshold: value >= threshold
    else:
        comparator = lambda value, threshold: value <= threshold

    # Note: the comparison uses the raw value, not the rounded display value
    if comparator(value, thresholds.crit):
        critical(results_str)
    elif comparator(value, thresholds.warn):
        warning(results_str)
    else:
        ok(results_str)
|
||||||
|
|
||||||
|
|
||||||
|
# Memoized for the life of the process so repeated checks against the same
# daemon/registry URL hit the network only once.
@lru_cache(maxsize=None)
def get_url(url):
    """GET `url` through the shared opener; return (parsed JSON body, status).

    NOTE(review): `timeout` is a module global — presumably set from the CLI
    arguments before any check runs; it is not defined in this chunk, confirm
    against the argument-processing code.
    """
    logger.debug("get_url: {}".format(url))
    response = better_urllib_get.open(url, timeout=timeout)
    logger.debug("get_url: {} {}".format(url, response.status))
    return process_urllib_response(response), response.status
|
||||||
|
|
||||||
|
|
||||||
|
def process_urllib_response(response):
    """Read an HTTP response body, decode it as UTF-8, and parse it as JSON."""
    raw = response.read()
    body = raw.decode('utf-8')
    return json.loads(body)
|
||||||
|
|
||||||
|
|
||||||
|
def get_container_info(name):
    """Return the inspect JSON document for a single container."""
    body, _ = get_url(daemon + '/containers/{container}/json'.format(container=name))
    return body
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_info(name):
    """Return the inspect JSON document for an image (by name or ID)."""
    body, _ = get_url(daemon + '/images/{image}/json'.format(image=name))
    return body
|
||||||
|
|
||||||
|
|
||||||
|
def get_state(container):
    """Return the 'State' sub-document of a container's inspect data."""
    info = get_container_info(container)
    return info['State']
|
||||||
|
|
||||||
|
|
||||||
|
def get_stats(container):
    """Grab one non-streaming stats sample for a container."""
    sample, _ = get_url(daemon + '/containers/{container}/stats?stream=0'.format(container=container))
    return sample
|
||||||
|
|
||||||
|
|
||||||
|
def get_ps_name(name_list):
    """Pick the `docker ps` display name from a container's name list.

    That is the first entry that starts with '/' and contains no further '/'
    (other entries are network-alias style names like '/parent/child').

    :raises NameError: when no entry qualifies
    """
    candidates = [n[1:] for n in name_list if n.startswith('/') and '/' not in n[1:]]
    if not candidates:
        raise NameError("Error when trying to identify 'ps' name in {}".format(name_list))
    return candidates[0]
|
||||||
|
|
||||||
|
|
||||||
|
def get_containers(names, require_present):
    """Resolve container-name regexes to the set of matching container names.

    :param names: list of anchored-regex patterns, or ['all'] for everything
    :param require_present: when True, a pattern with zero matches records a
        CRITICAL result (but resolution of the other patterns continues)
    :return: set of matching container names
    """
    # all=1 includes stopped containers
    containers_list, _ = get_url(daemon + '/containers/json?all=1')

    all_container_names = set(get_ps_name(x['Names']) for x in containers_list)

    if 'all' in names:
        return all_container_names

    filtered = set()
    for matcher in names:
        found = False
        for candidate in all_container_names:
            # Patterns are anchored so 'web' does not also match 'web2'
            if re.match("^{}$".format(matcher), candidate):
                filtered.add(candidate)
                found = True

        # If we don't find a container that matches our regex
        if require_present and not found:
            critical("No containers match {}".format(matcher))

    return filtered
|
||||||
|
|
||||||
|
|
||||||
|
def get_container_image_id(container):
    """Return the image ID (local digest) the container was created from."""
    # find registry and tag
    return get_container_info(container)['Image']
|
||||||
|
|
||||||
|
|
||||||
|
def get_container_image_urls(container):
    """Return the repo tags attached to the image backing a container."""
    image_ref = get_container_info(container)['Image']
    return get_image_info(image_ref)['RepoTags']
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_image_name_to_manifest_url(image_name, insecure_registries):
    """Build the registry v2 manifest URL for an image reference.

    :param image_name: image reference, e.g. 'nginx:latest' or 'reg:5000/app'
    :param insecure_registries: registries to contact over plain http
    :return: (manifest_url, registry_host)
    """
    parsed = parse_image_name(image_name)

    insecure = {reg.lower() for reg in insecure_registries}

    # Registry query url — insecure registries get plain http
    scheme = 'http' if parsed.registry.lower() in insecure else 'https'
    url = '{scheme}://{registry}/v2/{image_name}/manifests/{image_tag}'.format(
        scheme=scheme,
        registry=parsed.registry,
        image_name=parsed.name,
        image_tag=parsed.tag)
    return url, parsed.registry
|
||||||
|
|
||||||
|
|
||||||
|
# Auth servers seem picky about being hit too hard. Can't figure out why. ;)
|
||||||
|
# As result it is best to single thread this check
|
||||||
|
# This is based on https://docs.docker.com/registry/spec/auth/token/#requesting-a-token
|
||||||
|
# Auth servers seem picky about being hit too hard. Can't figure out why. ;)
# As result it is best to single thread this check
# This is based on https://docs.docker.com/registry/spec/auth/token/#requesting-a-token
def get_digest_from_registry(url):
    """Fetch an image manifest and return its config digest (or None).

    :raises RegistryError: when the registry responds with a non-200 status
    """
    logger.debug("get_digest_from_registry")
    # query registry
    # TODO: Handle logging in if needed
    manifest, status = get_url(url=url)

    if status != 200:
        raise RegistryError(response=manifest)
    return manifest['config'].get('digest', None)
|
||||||
|
|
||||||
|
|
||||||
|
def set_rc(new_rc):
    """Raise the global return code to new_rc if it is more severe."""
    global rc
    rc = max(rc, new_rc)
|
||||||
|
|
||||||
|
|
||||||
|
def ok(message):
    """Record an OK result and its message for the final report."""
    set_rc(OK_RC)
    messages.append('OK: %s' % message)
|
||||||
|
|
||||||
|
|
||||||
|
def warning(message):
    """Record a WARNING result and its message for the final report."""
    set_rc(WARNING_RC)
    messages.append('WARNING: %s' % message)
|
||||||
|
|
||||||
|
|
||||||
|
def critical(message):
    """Record a CRITICAL result and its message for the final report."""
    set_rc(CRITICAL_RC)
    messages.append('CRITICAL: %s' % message)
|
||||||
|
|
||||||
|
|
||||||
|
def unknown(message):
    """Record an UNKNOWN result and its message for the final report."""
    set_rc(UNKNOWN_RC)
    messages.append('UNKNOWN: %s' % message)
|
||||||
|
|
||||||
|
|
||||||
|
def require_running(name):
    """Decorator factory: run the wrapped check only on running containers.

    :param name: check name used in the CRITICAL message when skipped
    """

    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            if normalize_state(get_state(container)).lower() == "running":
                func(container, *args, **kwargs)
            else:
                # container is not running, can't perform check
                critical('{container} is not "running", cannot check {check}"'.format(container=container,
                                                                                      check=name))

        return wrapper

    return inner_decorator
|
||||||
|
|
||||||
|
|
||||||
|
def multithread_execution(disable_threading=DISABLE_THREADING):
    """Decorator factory: submit the wrapped check to the parallel executor.

    NOTE(review): the `disable_threading` parameter is accepted but never
    read — the wrapper consults the module-level DISABLE_THREADING flag at
    call time instead, which lets tests flip the flag after decoration.
    """

    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            if DISABLE_THREADING:
                # Testing mode: run inline, synchronously
                func(container, *args, **kwargs)
            else:
                # `parallel_executor` is a module global — presumably created in
                # the main path before checks run; not visible in this chunk.
                threads.append(parallel_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator
|
||||||
|
|
||||||
|
|
||||||
|
def singlethread_execution(disable_threading=DISABLE_THREADING):
    """Decorator factory: submit the wrapped check to the serial executor.

    Used for checks (like check_version) that must not hit auth servers in
    parallel. NOTE(review): as with multithread_execution, the
    `disable_threading` parameter is ignored; the module-level
    DISABLE_THREADING flag is consulted at call time.
    """

    def inner_decorator(func):
        def wrapper(container, *args, **kwargs):
            if DISABLE_THREADING:
                # Testing mode: run inline, synchronously
                func(container, *args, **kwargs)
            else:
                # `serial_executor` is a module global — presumably created in
                # the main path before checks run; not visible in this chunk.
                threads.append(serial_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator
|
||||||
|
|
||||||
|
|
||||||
|
def parse_image_name(image_name):
    """
    Parses image names into their constituent parts.

    Bare names get the docker-hub defaults filled in, e.g.
    'nginx' -> registry-1.docker.io/library/nginx:latest.

    :param image_name: image reference string
    :return: ImageName namedtuple (registry, name, tag, full_name)
    """

    # These are based on information found here
    # https://docs.docker.com/engine/reference/commandline/tag/#extended-description
    # https://github.com/docker/distribution/blob/master/reference/regexp.go
    host_segment_re = '[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?'
    hostname_re = r'({host_segment}\.)+{host_segment}'.format(host_segment=host_segment_re)
    # A registry is either a dotted hostname (port optional) or any host with
    # an explicit port — that is how 'localhost:5000/x' is told apart from a
    # plain image path like 'user/x'.
    registry_re = r'((?P<registry>({hostname_re}(:\d+)?|{host_segment_re}:\d+))/)'.format(
        host_segment_re=host_segment_re, hostname_re=hostname_re)
    name_component_ends_re = '[a-z0-9]'
    name_component_middle_re = '[a-z0-9._-]'  # Ignoring spec limit of two _
    name_component_re = '({end}{middle}*{end}|{end})'.format(end=name_component_ends_re,
                                                             middle=name_component_middle_re)
    image_name_re = "(?P<image_name>({name_component}/)*{name_component})".format(name_component=name_component_re)
    image_tag_re = '(?P<image_tag>[a-zA-Z0-9_][a-zA-Z0-9_.-]*)'
    full_re = '^{registry}?{image_name}(:{image_tag})?$'.format(registry=registry_re, image_name=image_name_re,
                                                                image_tag=image_tag_re)
    parsed = re.match(full_re, image_name)

    # Default to the public hub when no registry is present in the reference
    registry = parsed.group('registry') if parsed.group('registry') else DEFAULT_PUBLIC_REGISTRY

    image_name = parsed.group('image_name')
    # Official hub images live under the implicit 'library/' namespace
    image_name = image_name if '/' in image_name or registry != DEFAULT_PUBLIC_REGISTRY else 'library/' + image_name

    image_tag = parsed.group('image_tag')
    image_tag = image_tag if image_tag else 'latest'

    full_image_name = "{registry}/{image_name}:{image_tag}".format(
        registry=registry,
        image_name=image_name,
        image_tag=image_tag)

    return ImageName(registry=registry, name=image_name, tag=image_tag, full_name=full_image_name)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_state(status_info):
    """Return a single state string from a container 'State' document.

    Newer engines expose a 'Status' field holding the current state directly
    (running, exited, paused, ...); older ones only expose boolean flags, so
    fall back to checking those in priority order.
    """
    # Modern engine: the answer is right there
    if "Status" in status_info:
        return status_info["Status"]

    # Legacy engine: derive the state from the boolean flags.
    # Flags are read lazily and in priority order, matching the original
    # elif chain (a missing lower-priority key is never touched).
    if status_info["Restarting"]:
        return 'Restarting'
    if status_info["Paused"]:
        return 'Paused'
    if status_info["Dead"]:
        return 'Dead'
    if status_info["Running"]:
        return "Running"
    return 'Exited'
|
||||||
|
|
||||||
|
|
||||||
|
# Checks
|
||||||
|
#############################################################################################
|
||||||
|
|
||||||
|
@multithread_execution()
@require_running(name='memory')
def check_memory(container, thresholds):
    """Check a container's memory usage against warn/crit thresholds.

    :param container: container name
    :param thresholds: ThresholdSpec; units must be a key of unit_adjustments
        ('%', 'B', 'KB', 'MB', 'GB', 'TB')

    Records UNKNOWN when the units are unrecognized, otherwise delegates the
    warn/crit evaluation and perfdata emission to evaluate_numeric_thresholds.
    """
    # Idiom fix: was `if not thresholds.units in unit_adjustments` (PEP 8
    # prefers the `not in` operator)
    if thresholds.units not in unit_adjustments:
        unknown("Memory units must be one of {}".format(list(unit_adjustments.keys())))
        return

    inspection = get_stats(container)

    # Subtracting cache to match what `docker stats` does.
    adjusted_usage = inspection['memory_stats']['usage'] - inspection['memory_stats']['stats']['total_cache']
    # Renamed the local from `max` to stop shadowing the builtin; it is still
    # passed to evaluate_numeric_thresholds via the `max=` keyword.
    if thresholds.units == '%':
        usage_limit = 100
        usage = int(100 * adjusted_usage / inspection['memory_stats']['limit'])
    else:
        usage_limit = inspection['memory_stats']['limit'] / unit_adjustments[thresholds.units]
        usage = adjusted_usage / unit_adjustments[thresholds.units]

    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='memory',
                                short_name='mem', min=0, max=usage_limit)
|
||||||
|
|
||||||
|
|
||||||
|
@multithread_execution()
def check_status(container, desired_state):
    """Check that a container's current state matches the desired state."""
    wanted = desired_state.lower()
    actual = normalize_state(get_state(container)).lower()
    if wanted != actual:
        critical("{} state is not {}".format(container, desired_state))
        return
    ok("{} status is {}".format(container, desired_state))
|
||||||
|
|
||||||
|
|
||||||
|
@multithread_execution()
@require_running('health')
def check_health(container):
    """Report the container's built-in healthcheck status.

    healthy -> OK, unhealthy -> CRITICAL, anything else (e.g. starting) or
    no healthcheck configured -> UNKNOWN.
    """
    state = get_state(container)
    # Guard clause: no healthcheck data at all
    if "Health" not in state or "Status" not in state["Health"]:
        unknown('{} has no health check data'.format(container))
        return

    health = state["Health"]["Status"]
    message = "{} is {}".format(container, health)
    if health == 'healthy':
        ok(message)
    elif health == 'unhealthy':
        critical(message)
    else:
        unknown(message)
|
||||||
|
|
||||||
|
|
||||||
|
@multithread_execution()
@require_running('uptime')
def check_uptime(container, thresholds):
    """Check how long a container has been up (low uptime is the bad case)."""
    started_at = get_container_info(container)['State']['StartedAt']
    # Docker timestamps carry sub-second precision; keep only YYYY-MM-DDTHH:MM:SS
    started = datetime.strptime(started_at[0:19], "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
    uptime_secs = (datetime.now(timezone.utc) - started).total_seconds()

    graph_padding = 2
    thresholds.units = 's'
    # greater_than=False: trip when uptime is at or BELOW the thresholds
    evaluate_numeric_thresholds(container=container, value=uptime_secs, thresholds=thresholds, name='uptime',
                                short_name='up', min=0, max=graph_padding, greater_than=False)
|
||||||
|
|
||||||
|
|
||||||
|
@multithread_execution()
def check_image_age(container, thresholds):
    """Check how many days old the image behind a container is."""
    image_ref = get_container_info(container)['Image']
    created = get_image_info(image_ref)['Created']
    # Docker timestamps carry sub-second precision; keep only YYYY-MM-DDTHH:MM:SS
    built = datetime.strptime(created[0:19], "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
    age_days = (datetime.now(timezone.utc) - built).days

    graph_padding = 2
    thresholds.units = 'd'
    evaluate_numeric_thresholds(container=container, value=age_days, thresholds=thresholds, name='image_age',
                                short_name='age', min=0, max=graph_padding, greater_than=True)
|
||||||
|
|
||||||
|
|
||||||
|
@multithread_execution()
@require_running('restarts')
def check_restarts(container, thresholds):
    """Check how many times a container has been restarted."""
    restart_count = int(get_container_info(container)['RestartCount'])

    graph_padding = 2
    evaluate_numeric_thresholds(container=container, value=restart_count, thresholds=thresholds, name='restarts',
                                short_name='re', min=0, max=graph_padding)
|
||||||
|
|
||||||
|
|
||||||
|
# Single-threaded: registries' auth servers misbehave when hit in parallel
@singlethread_execution()
def check_version(container, insecure_registries):
    """Compare a container's local image digest against its registry digest.

    OK when they match, CRITICAL when they differ, UNKNOWN for every
    condition that prevents the comparison (no digest, ambiguous tags,
    unreachable registry, TLS mismatch, registry error).
    """
    image_id = get_container_image_id(container)
    logger.debug("Local container image ID: {}".format(image_id))
    if image_id is None:
        unknown('Checksum missing for "{}", try doing a pull'.format(container))
        return

    image_urls = get_container_image_urls(container=container)
    # With several tags there is no way to know which one to compare against
    if len(image_urls) > 1:
        unknown('"{}" has multiple tags/names. Unsure which one to use to check the version.'.format(container))
        return
    elif len(image_urls) == 0:
        unknown('"{}" has last no repository tag. Is this anywhere else?'.format(container))
        return

    url, registry = normalize_image_name_to_manifest_url(image_urls[0], insecure_registries)
    logger.debug("Looking up image digest here {}".format(url))
    try:
        registry_hash = get_digest_from_registry(url)
    except URLError as e:
        # Discriminate on the nested reason: an SSL UNKNOWN_PROTOCOL error
        # means we spoke https to a plain-http registry
        if hasattr(e.reason, 'reason') and e.reason.reason == 'UNKNOWN_PROTOCOL':
            unknown(
                "TLS error connecting to registry {} for {}, should you use the '--insecure-registry' flag?" \
                    .format(registry, container))
            return
        # DNS failure: registry host does not resolve
        # NOTE(review): this matches the macOS getaddrinfo message text; Linux
        # wording differs, so this branch may not trigger there — confirm.
        elif hasattr(e.reason, 'strerror') and e.reason.strerror == 'nodename nor servname provided, or not known':
            unknown(
                "Cannot reach registry for {} at {}".format(container, url))
            return
        else:
            raise e
    except RegistryError as e:
        unknown("Cannot check version, couldn't retrieve digest for {} while checking {}.".format(container, url))
        return
    logger.debug("Image digests, local={} remote={}".format(image_id, registry_hash))
    if registry_hash == image_id:
        ok("{}'s version matches registry".format(container))
        return
    critical("{}'s version does not match registry".format(container))
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_cpu_capacity_precentage(info, stats):
    """Compute CPU usage as a percentage of the container's allowed capacity.

    :param info: container inspect document (for HostConfig CPU limits)
    :param stats: one stats sample (cpu_stats + precpu_stats deltas)
    :return: usage percentage, rounded to a whole number
    """
    host_config = info['HostConfig']
    cpu_stats = stats['cpu_stats']

    # Newer engines report the online cpu count directly; otherwise fall back
    # to the length of the per-cpu usage list
    if 'online_cpus' in cpu_stats:
        num_cpus = cpu_stats['online_cpus']
    else:
        num_cpus = len(cpu_stats['cpu_usage']['percpu_usage'])

    # Identify which CPU-limit mechanism (if any) is in force
    if host_config.get('NanoCpus', 0) != 0:
        # --cpus
        period = 1000000000
        quota = host_config['NanoCpus']
    elif host_config.get('CpuQuota', 0) != 0:
        # --cpu-quota (CpuPeriod of 0 means the 100ms default)
        period = 100000 if host_config['CpuPeriod'] == 0 else host_config['CpuPeriod']
        quota = host_config['CpuQuota']
    else:
        # unlimited
        period = 1
        quota = num_cpus

    if period * num_cpus < quota:
        # This handles the case where the quota is actually bigger than amount
        # available by all the cpus.
        available_limit_ratio = 1
    else:
        available_limit_ratio = (period * num_cpus) / quota

    cpu_delta = cpu_stats['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
    system_delta = cpu_stats['system_cpu_usage'] - stats['precpu_stats']['system_cpu_usage']
    usage = (cpu_delta / system_delta) * available_limit_ratio
    usage = round(usage * 100, 0)
    return usage
|
||||||
|
|
||||||
|
|
||||||
|
@multithread_execution()
@require_running('cpu')
def check_cpu(container, thresholds):
    """Check a container's CPU usage (percent of its allowed capacity)."""
    info = get_container_info(container)
    stats = get_stats(container=container)

    usage = calculate_cpu_capacity_precentage(info=info, stats=stats)

    graph_max = 100
    thresholds.units = '%'
    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='cpu', short_name='cpu',
                                min=0, max=graph_max)
|
||||||
|
|
||||||
|
|
||||||
|
def process_args(args):
    """Build the argument parser, parse *args*, and configure connection globals.

    Side effects: sets module globals ``timeout``, ``daemon`` and
    ``connection_type`` from the parsed options.

    :param args: list of command line arguments (excluding the program name)
    :return: the parsed argparse namespace
    """
    parser = argparse.ArgumentParser(description='Check docker containers.')

    # Connect to local socket or ip address
    connection_group = parser.add_mutually_exclusive_group()
    connection_group.add_argument('--connection',
                                  dest='connection',
                                  action='store',
                                  default=DEFAULT_SOCKET,
                                  type=str,
                                  metavar='[/<path to>/docker.socket|<ip/host address>:<port>]',
                                  help='Where to find docker daemon socket. (default: %(default)s)')

    connection_group.add_argument('--secure-connection',
                                  dest='secure_connection',
                                  action='store',
                                  type=str,
                                  metavar='[<ip/host address>:<port>]',
                                  help='Where to find TLS protected docker daemon socket.')

    # Base used for byte-unit math (1024 binary vs 1000 decimal)
    base_group = parser.add_mutually_exclusive_group()
    base_group.add_argument('--binary_units',
                            dest='units_base',
                            action='store_const',
                            const=1024,
                            help='Use a base of 1024 when doing calculations of KB, MB, GB, & TB (This is default)')

    base_group.add_argument('--decimal_units',
                            dest='units_base',
                            action='store_const',
                            const=1000,
                            help='Use a base of 1000 when doing calculations of KB, MB, GB, & TB')
    parser.set_defaults(units_base=1024)

    # Connection timeout
    parser.add_argument('--timeout',
                        dest='timeout',
                        action='store',
                        type=float,
                        default=DEFAULT_TIMEOUT,
                        help='Connection timeout in seconds. (default: %(default)s)')

    # Container name
    parser.add_argument('--containers',
                        dest='containers',
                        action='store',
                        nargs='+',
                        type=str,
                        default=['all'],
                        help='One or more RegEx that match the names of the container(s) to check. If omitted all containers are checked. (default: %(default)s)')

    # Presence requirement (modifier for --containers)
    parser.add_argument('--present',
                        dest='present',
                        default=False,
                        action='store_true',
                        help='Modifies --containers so that each RegEx must match at least one container.')

    # Threads
    parser.add_argument('--threads',
                        dest='threads',
                        default=DEFAULT_PARALLELISM,
                        action='store',
                        type=int,
                        help='This + 1 is the maximum number of concurent threads/network connections. (default: %(default)s)')

    # CPU
    parser.add_argument('--cpu',
                        dest='cpu',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Check cpu usage percentage taking into account any limits.')

    # Memory
    parser.add_argument('--memory',
                        dest='memory',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT:UNITS',
                        help='Check memory usage taking into account any limits. Valid values for units are %%,B,KB,MB,GB.')

    # State
    parser.add_argument('--status',
                        dest='status',
                        action='store',
                        type=str,
                        help='Desired container status (running, exited, etc).')

    # Health
    parser.add_argument('--health',
                        dest='health',
                        default=None,
                        action='store_true',
                        help="Check container's health check status")

    # Age
    parser.add_argument('--uptime',
                        dest='uptime',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Minimum container uptime in seconds. Use when infrequent crashes are tolerated.')

    # Image Age
    parser.add_argument('--image-age',
                        dest='image_age',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Maximum image age in days.')

    # Version
    parser.add_argument('--version',
                        dest='version',
                        default=None,
                        action='store_true',
                        help='Check if the running images are the same version as those in the registry. Useful for finding stale images. Does not support login.')

    # Insecure registries (only meaningful together with --version)
    parser.add_argument('--insecure-registries',
                        dest='insecure_registries',
                        action='store',
                        nargs='+',
                        type=str,
                        default=[],
                        help='List of registries to connect to with http(no TLS). Useful when using "--version" with images from insecure registries.')

    # Restart
    parser.add_argument('--restarts',
                        dest='restarts',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Container restart thresholds.')

    # no-ok
    parser.add_argument('--no-ok',
                        dest='no_ok',
                        action='store_true',
                        help='Make output terse suppressing OK messages. If all checks are OK return a single OK.')

    # no-performance
    parser.add_argument('--no-performance',
                        dest='no_performance',
                        action='store_true',
                        help='Suppress performance data. Reduces output when performance data is not being used.')

    parser.add_argument('-V', action='version', version='%(prog)s {}'.format(__version__))

    # With no arguments at all, show the usage text before parsing falls
    # through to the defaults (parse_args below does not exit on empty input).
    if len(args) == 0:
        parser.print_help()

    parsed_args = parser.parse_args(args=args)

    global timeout
    timeout = parsed_args.timeout

    global daemon
    global connection_type
    if parsed_args.secure_connection:
        daemon = 'https://' + parsed_args.secure_connection
        connection_type = 'https'
    elif parsed_args.connection:
        if parsed_args.connection[0] == '/':
            # A leading slash means a unix socket path; the trailing ':' is
            # required by the socket:// URL form used elsewhere in this file.
            daemon = 'socket://' + parsed_args.connection + ':'
            connection_type = 'socket'
        else:
            daemon = 'http://' + parsed_args.connection
            connection_type = 'http'

    return parsed_args
|
||||||
|
|
||||||
|
|
||||||
|
def no_checks_present(parsed_args):
    """Return True when the parsed arguments request no checks at all.

    The set of available checks is discovered by scanning module globals for
    callables named ``check_*``; ``--present`` is treated as a check even
    though it is not implemented as one.
    """
    # Derive check names ('check_cpu' -> 'cpu') from the module namespace.
    check_names = [name[6:] for name in globals() if name.startswith('check_')]
    every_check_unset = all(getattr(parsed_args, name) is None for name in check_names)
    return every_check_unset and not parsed_args.present
|
||||||
|
|
||||||
|
|
||||||
|
def socketfile_permissions_failure(parsed_args):
    """Return True when a unix-socket connection is configured but the socket
    file is missing, not actually a socket, or not read/write accessible.

    Only applies when the module-global ``connection_type`` is 'socket';
    TCP connections always return False.
    """
    if connection_type != 'socket':
        return False
    path = parsed_args.connection
    usable = (os.path.exists(path)
              and stat.S_ISSOCK(os.stat(path).st_mode)
              and os.access(path, os.R_OK)
              and os.access(path, os.W_OK))
    return not usable
|
||||||
|
|
||||||
|
|
||||||
|
def print_results():
    """Print the final nagios-style status line.

    Joins the accumulated ``messages`` with '; '; in ``no_ok`` (terse) mode
    all 'OK: ' messages are dropped, collapsing to a single 'OK' when nothing
    else remains. Performance data is appended after '|' unless suppressed.
    """
    if no_ok:
        # Terse mode: strip every OK message first.
        non_ok_messages = [message for message in messages if not message.startswith('OK: ')]
        if non_ok_messages:
            messages_concat = '; '.join(non_ok_messages)
        else:
            messages_concat = 'OK'
    else:
        messages_concat = '; '.join(messages)

    if no_performance or len(performance_data) == 0:
        print(messages_concat)
    else:
        print(messages_concat + '|' + ' '.join(performance_data))
|
||||||
|
|
||||||
|
|
||||||
|
def perform_checks(raw_args):
    """Parse arguments, initialise module globals/executors, and run every
    requested check against each matching container.

    :param raw_args: list of command line arguments (excluding program name)
    """
    args = process_args(raw_args)

    global parallel_executor
    parallel_executor = futures.ThreadPoolExecutor(max_workers=args.threads)
    global serial_executor
    serial_executor = futures.ThreadPoolExecutor(max_workers=1)

    global unit_adjustments
    unit_adjustments = {key: args.units_base ** value for key, value in UNIT_ADJUSTMENTS_TEMPLATE.items()}

    global no_ok
    no_ok = args.no_ok

    global no_performance
    # BUG FIX: this previously read args.no_ok, which made --no-performance a
    # no-op and caused --no-ok to silently suppress perfdata as well.
    no_performance = args.no_performance

    if socketfile_permissions_failure(args):
        unknown("Cannot access docker socket file. User ID={}, socket file={}".format(os.getuid(), args.connection))
        return

    if args.containers == ["all"] and args.present:
        unknown("You can not use --present without --containers")
        return

    if no_checks_present(args):
        unknown("No checks specified.")
        return

    # Here is where all the work happens
    #############################################################################################
    containers = get_containers(args.containers, args.present)

    if len(containers) == 0 and not args.present:
        unknown("No containers names found matching criteria")
        return

    for container in containers:

        # Check status
        if args.status:
            check_status(container, args.status)

        # Check version
        if args.version:
            check_version(container, args.insecure_registries)

        # below are checks that require a 'running' status

        # Check health
        if args.health:
            check_health(container)

        # Check cpu usage
        if args.cpu:
            check_cpu(container, parse_thresholds(args.cpu, units_required=False))

        # Check memory usage
        if args.memory:
            check_memory(container, parse_thresholds(args.memory, units_required=False))

        # Check uptime
        if args.uptime:
            check_uptime(container, parse_thresholds(args.uptime, include_units=False))

        # Check image age
        if args.image_age:
            check_image_age(container, parse_thresholds(args.image_age, include_units=False))

        # Check restart count
        if args.restarts:
            check_restarts(container, parse_thresholds(args.restarts, include_units=False))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Entry point: run all checks, surface worker-thread exceptions, print
    the result line, and exit with the accumulated nagios return code."""
    try:
        perform_checks(argv[1:])

        # Drain the futures so any exception raised inside a worker thread
        # propagates here instead of being silently dropped.
        for completed in futures.as_completed(threads):
            completed.result()

    except Exception as e:
        traceback.print_exc()
        unknown("Exception raised during check': {}".format(repr(e)))

    print_results()
    exit(rc)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point when invoked directly (e.g. by NRPE).
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user