Compare commits
2 Commits
83081b757b
...
4d63d9f3e9
Author | SHA1 | Date | |
---|---|---|---|
4d63d9f3e9 | |||
dee3bce537 |
996
check_docker
Executable file
996
check_docker
Executable file
@ -0,0 +1,996 @@
|
||||
#!/usr/bin/env python3
|
||||
# logging.basicConfig(level=logging.DEBUG)
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import stat
|
||||
import traceback
|
||||
from collections import deque, namedtuple, UserDict, defaultdict
|
||||
from concurrent import futures
|
||||
from datetime import datetime, timezone
|
||||
from functools import lru_cache
|
||||
from http.client import HTTPConnection
|
||||
from sys import argv
|
||||
from urllib import request
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import AbstractHTTPHandler, HTTPHandler, HTTPSHandler, OpenerDirector, HTTPRedirectHandler, \
|
||||
Request, HTTPBasicAuthHandler
|
||||
|
||||
# getLogger() is called without a name, so this is the root logger.
logger = logging.getLogger()

# Module metadata. __version__ is what the '-V' command-line flag prints.
__author__ = 'Tim Laurence'
__copyright__ = "Copyright 2019"
__credits__ = ['Tim Laurence']
__license__ = "GPL"
__version__ = "2.2.2"
|
||||
|
||||
'''
|
||||
nrpe compatible check for docker containers.
|
||||
|
||||
Requires Python 3
|
||||
|
||||
Note: I really would have preferred to have used requests for all the network connections but that would have added a
|
||||
dependency.
|
||||
'''
|
||||
|
||||
# Default value of the --connection argument.
DEFAULT_SOCKET = '/var/run/docker.sock'
# Default value of the --timeout argument, in seconds.
DEFAULT_TIMEOUT = 10.0
# NOTE(review): DEFAULT_PORT and DEFAULT_MEMORY_UNITS are not referenced in the part of the file reviewed here.
DEFAULT_PORT = 2375
DEFAULT_MEMORY_UNITS = 'B'
# Headers copied onto the shared urllib opener so they are sent with every request.
DEFAULT_HEADERS = [('Accept', 'application/vnd.docker.distribution.manifest.v2+json')]
# Registry assumed by parse_image_name() when an image name does not include one.
DEFAULT_PUBLIC_REGISTRY = 'registry-1.docker.io'

# Maps a unit to the power the unit base (1000 or 1024, chosen on the command line) is raised to.
UNIT_ADJUSTMENTS_TEMPLATE = {
    '%': 0,
    'B': 0,
    'KB': 1,
    'MB': 2,
    'GB': 3,
    'TB': 4
}
# Filled in by perform_checks() as {unit: base ** power} once the base is known.
unit_adjustments = None

# Reduce the message to a single OK unless a check fails.
no_ok = False

# Suppress performance data reporting
no_performance = False

# Plugin return codes, ordered so that a larger number is a more severe result.
OK_RC = 0
WARNING_RC = 1
CRITICAL_RC = 2
UNKNOWN_RC = 3

# These hold the final results
# rc starts below every real return code; set_rc() only ever raises it.
rc = -1
messages = []
performance_data = []

# Result type of parse_image_name().
ImageName = namedtuple('ImageName', "registry name tag full_name")
|
||||
|
||||
|
||||
class ThresholdSpec(UserDict):
    """Warning/critical thresholds plus optional units.

    The values live in the underlying mapping under the keys ``warn``, ``crit``
    and ``units`` (so the object can be expanded with ``**spec``) and can also
    be read as attributes of the same names.

    NOTE(review): no ``__setattr__`` is defined, so ``spec.units = 'x'`` creates
    a plain instance attribute and leaves the mapping entry untouched.
    """

    def __init__(self, warn, crit, units=''):
        super().__init__(warn=warn, crit=crit, units=units)

    def __getattr__(self, item):
        """Expose the three mapping keys as attributes.

        Only invoked when normal attribute lookup has already failed.

        :raises AttributeError: for any name other than warn, crit or units
        """
        if item in ('warn', 'crit', 'units'):
            return self.data[item]
        # UserDict has no __getattr__ to delegate to; raise the standard error
        # naming the missing attribute instead of a confusing one about 'super'.
        raise AttributeError(
            "'{}' object has no attribute '{}'".format(type(self).__name__, item))
|
||||
|
||||
|
||||
# How much threading can we do? We are generally not CPU bound so I am using this as a worst-case cap.
# This is the default of the --threads argument.
DEFAULT_PARALLELISM = 10

# Holds the futures returned by the executors for every check that was submitted
threads = []

# This is used during testing; when true the execution decorators call checks directly instead of submitting them.
DISABLE_THREADING = False
|
||||
|
||||
|
||||
# Hacked up urllib to handle sockets
|
||||
#############################################################################################
|
||||
# Docker runs an http connection over a socket. http.client knows how to deal with these
|
||||
# but lacks some niceties. Urllib wraps that and makes up for some of the deficiencies but
|
||||
# cannot fix the fact http.client can't read from socket files. In order to take advantage of
|
||||
# urllib and http.client's capabilities the class below tweaks HttpConnection and passes it
|
||||
# to urllib registering for socket:// connections
|
||||
|
||||
# This is all side effect so excluding coverage
|
||||
class SocketFileHandler(AbstractHTTPHandler):
    """urllib handler that serves ``socket://`` URLs over a unix socket file."""

    class SocketFileToHttpConnectionAdaptor(HTTPConnection):  # pragma: no cover
        """HTTPConnection whose transport is a unix socket file, not TCP."""

        def __init__(self, socket_file, timeout=DEFAULT_TIMEOUT):
            # Host and port are placeholders; connect() never uses them.
            super().__init__(host='', port=0, timeout=timeout)
            self.socket_file = socket_file

        def connect(self):
            """Open the unix stream socket and attach it to this connection."""
            self.sock = socket.socket(family=socket.AF_UNIX, type=socket.SOCK_STREAM, proto=0, fileno=None)
            unix_socket = self.sock
            unix_socket.settimeout(self.timeout)
            unix_socket.connect(self.socket_file)

    def socket_open(self, req):
        """Split the selector into '<socket file>:<http path>' and open the request."""
        pieces = req.selector.split(':', 1)
        socket_path, http_path = pieces
        req.host = socket_path
        req.selector = http_path
        adaptor = self.SocketFileToHttpConnectionAdaptor
        return self.do_open(adaptor, req)
|
||||
|
||||
|
||||
# Tokens are not cached because I expect the callers to cache the responses
|
||||
class Oauth2TokenAuthHandler(HTTPBasicAuthHandler):
    """urllib handler that answers a 401 'Bearer' challenge by fetching a token and retrying."""

    # Number of bearer-auth attempts per URL. Class-level, so shared by all instances.
    auth_failure_tracker = defaultdict(int)

    def http_response(self, request, response):
        """Retry the request with a bearer token when the server asks for one.

        Any response that is not a 401 carrying a 'Bearer' www-authenticate
        challenge is returned unchanged.
        """
        code = response.code
        www_authenticate_header = response.headers.get('www-authenticate', None)
        if code == 401 and www_authenticate_header:
            scheme = www_authenticate_header.split()[0]
            if scheme.lower() == 'bearer':
                return self.process_oauth2(request, response, www_authenticate_header)

        return response

    https_response = http_response

    @staticmethod
    def _get_outh2_token(www_authenticate_header, timeout=None):
        """Request a token from the realm named in the challenge header.

        :param www_authenticate_header: value of the www-authenticate header; must
               contain quoted realm, scope and service fields
        :param timeout: seconds to wait for the token server; None keeps urlopen's default
        :return: the 'token' field of the token server's JSON reply
        :raises KeyError: if realm, scope or service is missing from the header
        """
        auth_fields = dict(re.findall(r"""(?:(?P<key>[^ ,=]+)="([^"]+)")""", www_authenticate_header))

        auth_url = "{realm}?scope={scope}&service={service}".format(
            realm=auth_fields['realm'],
            scope=auth_fields['scope'],
            service=auth_fields['service'],
        )
        token_request = Request(auth_url)
        token_request.add_header("Content-Type", "application/x-www-form-urlencoded; charset=utf-8")
        # Without a timeout a stalled token server would hang the whole plugin.
        if timeout is None:
            token_response = request.urlopen(token_request)
        else:
            token_response = request.urlopen(token_request, timeout=timeout)
        return process_urllib_response(token_response)['token']

    def process_oauth2(self, request, response, www_authenticate_header):
        """Fetch a bearer token and re-issue the request with it.

        :raises HTTPError: (401) on the second challenge for the same URL
        """
        # This keeps infinite auth loops from happening
        full_url = request.full_url
        self.auth_failure_tracker[full_url] += 1
        if self.auth_failure_tracker[full_url] > 1:
            raise HTTPError(full_url, 401, "Stopping Oauth2 failure loop for {}".format(full_url),
                            response.headers, response)

        # Use the same timeout for the token request as for the original request.
        auth_token = self._get_outh2_token(www_authenticate_header,
                                           timeout=getattr(request, 'timeout', None))

        request.add_unredirected_header('Authorization', 'Bearer ' + auth_token)
        return self.parent.open(request, timeout=request.timeout)
|
||||
|
||||
|
||||
# Shared opener for every request: plain http/https, redirects, docker socket
# files and registry bearer-token authentication.
better_urllib_get = OpenerDirector()
better_urllib_get.addheaders = list(DEFAULT_HEADERS)
for _handler_class in (HTTPHandler, HTTPSHandler, HTTPRedirectHandler,
                       SocketFileHandler, Oauth2TokenAuthHandler):
    better_urllib_get.add_handler(_handler_class())
del _handler_class
|
||||
|
||||
|
||||
class RegistryError(Exception):
    """Raised when a registry answers a manifest request with a non-200 status."""

    def __init__(self, response):
        """
        :param response: the decoded body returned by the registry
        """
        # Initialise the base class so args/str()/repr() carry the response
        # even when it is passed by keyword.
        super().__init__(response)
        self.response_obj = response
|
||||
|
||||
|
||||
# Util functions
|
||||
#############################################################################################
|
||||
def parse_thresholds(spec, include_units=True, units_required=True):
    """
    Given a spec string break it up into ':' separated chunks. Convert strings to ints as it makes sense

    :param spec: The threshold specification being parsed, 'WARN:CRIT' or 'WARN:CRIT:UNITS'
    :param include_units: Specifies that units should be processed and returned if present
    :param units_required: Mark spec as invalid if the units are missing.
    :return: A ThresholdSpec holding warn, crit, and units ('' when not included or not present)
    :raises ValueError: if a chunk is blank, warn or crit is missing or not an integer,
            required units are missing, or there are too many chunks
    """
    parts = deque(spec.split(':'))
    if not all(parts):
        raise ValueError("Blanks are not allowed in a threshold specification: {}".format(spec))

    # Report a missing threshold as a spec error like the other checks here,
    # rather than letting popleft() fail with IndexError on an empty deque.
    if len(parts) < 2:
        raise ValueError("Both warn and crit thresholds are required in {}".format(spec))

    warn = int(parts.popleft())
    crit = int(parts.popleft())

    units = ''
    if include_units:
        if len(parts):
            units = parts.popleft()
        elif units_required:
            raise ValueError("Missing units in {}".format(spec))

    # Anything still left over is unexpected.
    if len(parts) != 0:
        raise ValueError("Too many threshold specifiers in {}".format(spec))

    return ThresholdSpec(warn=warn, crit=crit, units=units)
|
||||
|
||||
|
||||
def pretty_time(seconds):
    """Break a duration into human readable chunks.

    :param seconds: duration in seconds
    :return: list of strings such as ['1d', '2h', '3min', '4s']; units whose count
             would be zero are left out, except seconds which are always last
    """
    remainder = seconds
    result = []
    # '>=' so an exact boundary rolls over to the larger unit (60 -> '1min', not '60s').
    for unit_seconds, suffix in ((24 * 60 * 60, 'd'), (60 * 60, 'h'), (60, 'min')):
        if remainder >= unit_seconds:
            count, remainder = divmod(remainder, unit_seconds)
            result.append("{}{}".format(int(count), suffix))
    result.append("{}s".format(int(remainder)))
    return result
|
||||
|
||||
|
||||
def evaluate_numeric_thresholds(container, value, thresholds, name, short_name,
                                min=None, max=None, greater_than=True):
    """Record performance data for a metric and report ok/warning/critical for it.

    :param container: container name, used in the message and the perf label
    :param value: measured value
    :param thresholds: ThresholdSpec providing warn, crit and units
    :param name: metric name used in the message
    :param short_name: metric name used in the perf label
    :param min: optional minimum appended to the perf data
    :param max: optional maximum appended to the perf data
    :param greater_than: True when values at or above a threshold are bad,
           False when values at or below it are bad
    """
    # Some units don't have decimal places
    whole_numbers_only = thresholds.units in ['B', '%', '']

    def two_places(number):
        return round(number, 2)

    rounded_value = int(value) if whole_numbers_only else two_places(value)

    perf_fields = ["{container}_{short_name}={value}{units};{warn};{crit}".format(
        container=container,
        short_name=short_name,
        value=rounded_value,
        **thresholds)]
    if min is not None:
        perf_fields.append(str(math.floor(min) if whole_numbers_only else two_places(min)))
    if max is not None:
        perf_fields.append(str(math.ceil(max) if whole_numbers_only else two_places(max)))
    performance_data.append(';'.join(perf_fields))

    if thresholds.units == 's':
        # Durations are shown using their two most significant chunks.
        shown_duration = ' '.join(pretty_time(rounded_value)[:2])
        results_str = "{} {} is {}".format(container, name, shown_duration)
    else:
        results_str = "{} {} is {}{}".format(container, name, rounded_value, thresholds.units)

    def breaches(limit):
        # Compares the unrounded value, inclusive of the limit itself.
        if greater_than:
            return value >= limit
        return value <= limit

    if breaches(thresholds.crit):
        critical(results_str)
    elif breaches(thresholds.warn):
        warning(results_str)
    else:
        ok(results_str)
|
||||
|
||||
|
||||
@lru_cache(maxsize=None)
def get_url(url):
    """Fetch a URL through the shared opener and decode its JSON body.

    Results are cached per URL with no size limit, so each URL is requested at
    most once. The module-level ``timeout`` is used for the request.

    :return: tuple of (decoded body, HTTP status)
    """
    logger.debug("get_url: {}".format(url))
    reply = better_urllib_get.open(url, timeout=timeout)
    logger.debug("get_url: {} {}".format(url, reply.status))
    decoded_body = process_urllib_response(reply)
    return decoded_body, reply.status
|
||||
|
||||
|
||||
def process_urllib_response(response):
    """Read a response to the end and parse it as UTF-8 encoded JSON.

    :param response: object with a read() method returning bytes
    :return: the decoded JSON document
    """
    return json.loads(response.read().decode('utf-8'))
|
||||
|
||||
|
||||
def get_container_info(name):
    """Return the daemon's inspect document for a container."""
    inspect_url = daemon + '/containers/{container}/json'.format(container=name)
    document, _status = get_url(inspect_url)
    return document
|
||||
|
||||
|
||||
def get_image_info(name):
    """Return the daemon's inspect document for an image."""
    inspect_url = daemon + '/images/{image}/json'.format(image=name)
    document, _status = get_url(inspect_url)
    return document
|
||||
|
||||
|
||||
def get_state(container):
    """Return the 'State' section of a container's inspect document."""
    inspection = get_container_info(container)
    return inspection['State']
|
||||
|
||||
|
||||
def get_stats(container):
    """Return a single (non-streaming) stats sample for a container."""
    stats_url = daemon + '/containers/{container}/stats?stream=0'.format(container=container)
    sample, _status = get_url(stats_url)
    return sample
|
||||
|
||||
|
||||
def get_ps_name(name_list):
    """Pick the container's own name out of the names the daemon lists for it.

    :param name_list: names as reported by the daemon, e.g. ['/proxy/web', '/web']
    :return: the first name that starts with '/' and contains no other '/',
             with the leading '/' removed
    :raises NameError: if no entry has that form
    """
    for name in name_list:
        # startswith() also copes with an empty entry, where name[0] would raise IndexError.
        if name.startswith('/') and '/' not in name[1:]:
            return name[1:]
    raise NameError("Error when trying to identify 'ps' name in {}".format(name_list))
|
||||
|
||||
|
||||
def get_containers(names, require_present):
    """Resolve the requested name patterns to the containers known to the daemon.

    :param names: regular expressions that must match a whole container name;
           the literal entry 'all' selects every container
    :param require_present: report a critical for each pattern that matches nothing
    :return: set of matching container names
    """
    containers_list, _ = get_url(daemon + '/containers/json?all=1')

    all_container_names = set(get_ps_name(x['Names']) for x in containers_list)

    if 'all' in names:
        return all_container_names

    filtered = set()
    for matcher in names:
        # fullmatch() anchors the whole expression. Wrapping the text in '^...$'
        # mis-anchors alternations: 'a|b' became '^a|b$', which matches 'abc'.
        pattern = re.compile(matcher)
        matches = {candidate for candidate in all_container_names if pattern.fullmatch(candidate)}
        filtered |= matches
        # If we don't find a container that matches our regex
        if require_present and not matches:
            critical("No containers match {}".format(matcher))

    return filtered
|
||||
|
||||
|
||||
def get_container_image_id(container):
    """Return the 'Image' field of the container's inspect document."""
    return get_container_info(container)['Image']
|
||||
|
||||
|
||||
def get_container_image_urls(container):
    """Return the 'RepoTags' of the image the container was created from."""
    image_id = get_container_info(container)['Image']
    return get_image_info(image_id)['RepoTags']
|
||||
|
||||
|
||||
def normalize_image_name_to_manifest_url(image_name, insecure_registries):
    """Build the registry manifest URL for an image name.

    :param image_name: image name as accepted by parse_image_name()
    :param insecure_registries: registries to contact over http instead of https
           (compared case-insensitively)
    :return: tuple of (manifest url, registry)
    """
    image = parse_image_name(image_name)

    plain_http_registries = [entry.lower() for entry in insecure_registries]
    if image.registry.lower() in plain_http_registries:
        scheme = 'http'
    else:
        scheme = 'https'

    # Registry query url
    manifest_url = '{scheme}://{registry}/v2/{image_name}/manifests/{image_tag}'.format(
        scheme=scheme,
        registry=image.registry,
        image_name=image.name,
        image_tag=image.tag)
    return manifest_url, image.registry
|
||||
|
||||
|
||||
# Auth servers seem picky about being hit too hard. Can't figure out why. ;)
|
||||
# As result it is best to single thread this check
|
||||
# This is based on https://docs.docker.com/registry/spec/auth/token/#requesting-a-token
|
||||
def get_digest_from_registry(url):
    """Fetch a manifest and return the digest of its config section.

    :param url: manifest url to query
    :return: the config digest, or None when the manifest's config has none
    :raises RegistryError: if the registry answers with a status other than 200
    """
    logger.debug("get_digest_from_registry")
    # TODO: Handle logging in if needed
    manifest, status_code = get_url(url=url)

    if status_code == 200:
        return manifest['config'].get('digest', None)
    raise RegistryError(response=manifest)
|
||||
|
||||
|
||||
def set_rc(new_rc):
    """Raise the global return code to new_rc; it is never lowered."""
    global rc
    rc = max(rc, new_rc)
|
||||
|
||||
|
||||
def ok(message):
    """Record an OK result and its message."""
    set_rc(OK_RC)
    labelled = 'OK: ' + message
    messages.append(labelled)
|
||||
|
||||
|
||||
def warning(message):
    """Record a WARNING result and its message."""
    set_rc(WARNING_RC)
    labelled = 'WARNING: ' + message
    messages.append(labelled)
|
||||
|
||||
|
||||
def critical(message):
    """Record a CRITICAL result and its message."""
    set_rc(CRITICAL_RC)
    labelled = 'CRITICAL: ' + message
    messages.append(labelled)
|
||||
|
||||
|
||||
def unknown(message):
    """Record an UNKNOWN result and its message."""
    set_rc(UNKNOWN_RC)
    labelled = 'UNKNOWN: ' + message
    messages.append(labelled)
|
||||
|
||||
|
||||
def require_running(name):
    """Decorator factory: only run the check when the container is running.

    When the container is in any other state a critical is reported and the
    decorated check is not called.

    :param name: name of the check, used in the critical message
    """
    def inner_decorator(func):
        # Imported here because the module header only imports lru_cache from functools.
        from functools import wraps

        @wraps(func)  # keep the check's name and docstring on the wrapper
        def wrapper(container, *args, **kwargs):
            container_state = get_state(container)
            state = normalize_state(container_state)
            if state.lower() == "running":
                func(container, *args, **kwargs)
            else:
                # container is not running, can't perform check
                # (the message used to end with a stray, unbalanced double quote)
                critical('{container} is not "running", cannot check {check}'.format(container=container,
                                                                                      check=name))

        return wrapper

    return inner_decorator
|
||||
|
||||
|
||||
def multithread_execution(disable_threading=DISABLE_THREADING):
    """Decorator factory: submit the check to the parallel executor.

    The resulting future is appended to ``threads``. The check is called
    directly instead when threading is disabled.

    :param disable_threading: call the check directly for this decorator
           (previously this argument was accepted but ignored)
    """
    def inner_decorator(func):
        # Imported here because the module header only imports lru_cache from functools.
        from functools import wraps

        @wraps(func)  # keep the check's name and docstring on the wrapper
        def wrapper(container, *args, **kwargs):
            # The module flag is read at call time so it can still be toggled after decoration.
            if disable_threading or DISABLE_THREADING:
                func(container, *args, **kwargs)
            else:
                threads.append(parallel_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator
|
||||
|
||||
|
||||
def singlethread_execution(disable_threading=DISABLE_THREADING):
    """Decorator factory: submit the check to the serial executor.

    The resulting future is appended to ``threads``. The check is called
    directly instead when threading is disabled.

    :param disable_threading: call the check directly for this decorator
           (previously this argument was accepted but ignored)
    """
    def inner_decorator(func):
        # Imported here because the module header only imports lru_cache from functools.
        from functools import wraps

        @wraps(func)  # keep the check's name and docstring on the wrapper
        def wrapper(container, *args, **kwargs):
            # The module flag is read at call time so it can still be toggled after decoration.
            if disable_threading or DISABLE_THREADING:
                func(container, *args, **kwargs)
            else:
                threads.append(serial_executor.submit(func, container, *args, **kwargs))

        return wrapper

    return inner_decorator
|
||||
|
||||
|
||||
def parse_image_name(image_name):
    """
    Parses image names into their constituent parts.

    A missing registry defaults to DEFAULT_PUBLIC_REGISTRY, a missing tag to
    'latest', and single-component names on the default registry are prefixed
    with 'library/'.

    :param image_name: image reference of the form [registry/]name[:tag]
    :return: ImageName
    :raises ValueError: if image_name does not have that form
    """

    # These are based on information found here
    # https://docs.docker.com/engine/reference/commandline/tag/#extended-description
    # https://github.com/docker/distribution/blob/master/reference/regexp.go
    host_segment_re = '[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?'
    hostname_re = r'({host_segment}\.)+{host_segment}'.format(host_segment=host_segment_re)
    # A registry is either a dotted hostname (port optional) or a single segment with a port.
    registry_re = r'((?P<registry>({hostname_re}(:\d+)?|{host_segment_re}:\d+))/)'.format(
        host_segment_re=host_segment_re, hostname_re=hostname_re)
    name_component_ends_re = '[a-z0-9]'
    name_component_middle_re = '[a-z0-9._-]'  # Ignoring spec limit of two _
    name_component_re = '({end}{middle}*{end}|{end})'.format(end=name_component_ends_re,
                                                             middle=name_component_middle_re)
    image_name_re = "(?P<image_name>({name_component}/)*{name_component})".format(name_component=name_component_re)
    image_tag_re = '(?P<image_tag>[a-zA-Z0-9_][a-zA-Z0-9_.-]*)'
    full_re = '^{registry}?{image_name}(:{image_tag})?$'.format(registry=registry_re, image_name=image_name_re,
                                                                image_tag=image_tag_re)
    parsed = re.match(full_re, image_name)
    if parsed is None:
        # Fail with a clear message instead of AttributeError on None.group().
        raise ValueError("Unable to parse image name: {}".format(image_name))

    registry = parsed.group('registry') if parsed.group('registry') else DEFAULT_PUBLIC_REGISTRY

    image_name = parsed.group('image_name')
    image_name = image_name if '/' in image_name or registry != DEFAULT_PUBLIC_REGISTRY else 'library/' + image_name

    image_tag = parsed.group('image_tag')
    image_tag = image_tag if image_tag else 'latest'

    full_image_name = "{registry}/{image_name}:{image_tag}".format(
        registry=registry,
        image_name=image_name,
        image_tag=image_tag)

    return ImageName(registry=registry, name=image_name, tag=image_tag, full_name=full_image_name)
|
||||
|
||||
|
||||
def normalize_state(status_info):
    """Reduce a container 'State' document to a single status string.

    Newer engines report the state directly under 'Status' and that value is
    returned unchanged. Otherwise the older boolean flags are checked in order
    and the first one that is set wins; 'Exited' is returned when none are set.
    """
    if "Status" in status_info:
        return status_info['Status']

    # Old style: one boolean per state, checked in priority order.
    for flag in ('Restarting', 'Paused', 'Dead', 'Running'):
        if status_info[flag]:
            return flag
    return 'Exited'
|
||||
|
||||
|
||||
# Checks
|
||||
#############################################################################################
|
||||
|
||||
@multithread_execution()
@require_running(name='memory')
def check_memory(container, thresholds):
    """Check a running container's memory usage against the thresholds.

    Usage is reported as a percentage of the container's limit when the units
    are '%', otherwise as an amount scaled to the requested units.

    :param container: container name
    :param thresholds: ThresholdSpec whose units must be a key of unit_adjustments
    """
    if thresholds.units not in unit_adjustments:
        unknown("Memory units must be one of {}".format(list(unit_adjustments.keys())))
        return

    memory_stats = get_stats(container)['memory_stats']

    # Subtracting cache to match what `docker stats` does.
    # 'total_cache' is only reported on cgroup v1 hosts; fall back to the other
    # cache counters (cgroup v2 reports 'inactive_file') rather than failing with KeyError.
    detail = memory_stats.get('stats', {})
    cache = 0
    for key in ('total_cache', 'cache', 'inactive_file'):
        if key in detail:
            cache = detail[key]
            break
    adjusted_usage = memory_stats['usage'] - cache

    if thresholds.units == '%':
        upper_bound = 100
        usage = int(100 * adjusted_usage / memory_stats['limit'])
    else:
        upper_bound = memory_stats['limit'] / unit_adjustments[thresholds.units]
        usage = adjusted_usage / unit_adjustments[thresholds.units]

    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='memory',
                                short_name='mem', min=0, max=upper_bound)
|
||||
|
||||
|
||||
@multithread_execution()
def check_status(container, desired_state):
    """Report critical unless the container's state equals desired_state (case-insensitive)."""
    wanted = desired_state.lower()
    actual = normalize_state(get_state(container)).lower()
    if wanted == actual:
        ok("{} status is {}".format(container, desired_state))
    else:
        critical("{} state is not {}".format(container, desired_state))
|
||||
|
||||
|
||||
@multithread_execution()
@require_running('health')
def check_health(container):
    """Report the health-check status of a running container.

    'healthy' is ok, 'unhealthy' is critical, anything else (including missing
    health data) is unknown.
    """
    state = get_state(container)
    if not ("Health" in state and "Status" in state["Health"]):
        unknown('{} has no health check data'.format(container))
        return

    health = state["Health"]["Status"]
    message = "{} is {}".format(container, health)
    if health == 'healthy':
        report = ok
    elif health == 'unhealthy':
        report = critical
    else:
        report = unknown
    report(message)
|
||||
|
||||
|
||||
@multithread_execution()
@require_running('uptime')
def check_uptime(container, thresholds):
    """Check how long a running container has been up, in seconds.

    Lower is worse: the thresholds are breached when uptime is at or below them.
    The thresholds' units are set to 's'.
    """
    started_at = get_container_info(container)['State']['StartedAt']
    # Only the first 19 characters (up to whole seconds) are parsed; the result is treated as UTC.
    started = datetime.strptime(started_at[0:19], "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
    uptime = (datetime.now(timezone.utc) - started).total_seconds()

    thresholds.units = 's'
    evaluate_numeric_thresholds(container=container, value=uptime, thresholds=thresholds, name='uptime',
                                short_name='up', min=0, max=2, greater_than=False)
|
||||
|
||||
|
||||
@multithread_execution()
def check_image_age(container, thresholds):
    """Check the age, in whole days, of the image a container was created from.

    Higher is worse. The thresholds' units are set to 'd'.
    """
    image_id = get_container_info(container)['Image']
    created_at = get_image_info(image_id)['Created']
    # Only the first 19 characters (up to whole seconds) are parsed; the result is treated as UTC.
    created = datetime.strptime(created_at[0:19], "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
    image_age = (datetime.now(timezone.utc) - created).days

    thresholds.units = 'd'
    evaluate_numeric_thresholds(container=container, value=image_age, thresholds=thresholds, name='image_age',
                                short_name='age', min=0, max=2, greater_than=True)
|
||||
|
||||
|
||||
@multithread_execution()
@require_running('restarts')
def check_restarts(container, thresholds):
    """Check a running container's restart count against the thresholds."""
    restart_count = int(get_container_info(container)['RestartCount'])
    evaluate_numeric_thresholds(container=container, value=restart_count, thresholds=thresholds,
                                name='restarts', short_name='re', min=0, max=2)
|
||||
|
||||
|
||||
@singlethread_execution()
def check_version(container, insecure_registries):
    """Compare the container's image ID with the digest its registry reports for the same tag.

    Reports ok on a match, critical on a mismatch and unknown when the
    comparison cannot be made (no image ID, zero or several repo tags, or the
    registry cannot be queried).

    :param container: container name
    :param insecure_registries: registries to contact over http instead of https
    """
    image_id = get_container_image_id(container)
    logger.debug("Local container image ID: {}".format(image_id))
    if image_id is None:
        unknown('Checksum missing for "{}", try doing a pull'.format(container))
        return

    image_urls = get_container_image_urls(container=container)
    if len(image_urls) > 1:
        unknown('"{}" has multiple tags/names. Unsure which one to use to check the version.'.format(container))
        return
    elif len(image_urls) == 0:
        unknown('"{}" has last no repository tag. Is this anywhere else?'.format(container))
        return

    url, registry = normalize_image_name_to_manifest_url(image_urls[0], insecure_registries)
    logger.debug("Looking up image digest here {}".format(url))
    try:
        registry_hash = get_digest_from_registry(url)
    except URLError as e:
        reason = e.reason
        if hasattr(reason, 'reason') and reason.reason == 'UNKNOWN_PROTOCOL':
            unknown(
                "TLS error connecting to registry {} for {}, should you use the '--insecure-registry' flag?" \
                    .format(registry, container))
            return
        # Name-resolution failures surface as socket.gaierror with platform specific text;
        # the literal below is only the macOS wording, so check the type as well.
        elif isinstance(reason, socket.gaierror) or (
                hasattr(reason, 'strerror')
                and reason.strerror == 'nodename nor servname provided, or not known'):
            unknown(
                "Cannot reach registry for {} at {}".format(container, url))
            return
        else:
            raise
    except RegistryError:
        unknown("Cannot check version, couldn't retrieve digest for {} while checking {}.".format(container, url))
        return
    logger.debug("Image digests, local={} remote={}".format(image_id, registry_hash))
    if registry_hash == image_id:
        ok("{}'s version matches registry".format(container))
        return
    critical("{}'s version does not match registry".format(container))
|
||||
|
||||
|
||||
def calculate_cpu_capacity_precentage(info, stats):
    """Compute CPU usage as a percentage of what the container is allowed to use.

    :param info: container inspect document; its 'HostConfig' supplies the limits
    :param stats: stats sample with 'cpu_stats' and 'precpu_stats' sections
    :return: usage percentage rounded to a whole number (as a float)
    """
    host_config = info['HostConfig']
    cpu_stats = stats['cpu_stats']

    if 'online_cpus' in cpu_stats:
        num_cpus = cpu_stats['online_cpus']
    else:
        num_cpus = len(cpu_stats['cpu_usage']['percpu_usage'])

    # Identify limit system being used
    if host_config.get('NanoCpus', 0) != 0:
        # --cpus
        period, quota = 1000000000, host_config['NanoCpus']
    elif host_config.get('CpuQuota', 0) != 0:
        # --cpu-quota; a period of 0 is treated as 100000
        period = host_config['CpuPeriod'] if host_config['CpuPeriod'] != 0 else 100000
        quota = host_config['CpuQuota']
    else:
        # unlimited
        period, quota = 1, num_cpus

    host_capacity = period * num_cpus
    if host_capacity < quota:
        # This handles the case where the quota is actually bigger than amount available by all the cpus.
        available_limit_ratio = 1
    else:
        available_limit_ratio = host_capacity / quota

    previous = stats['precpu_stats']
    cpu_delta = cpu_stats['cpu_usage']['total_usage'] - previous['cpu_usage']['total_usage']
    system_delta = cpu_stats['system_cpu_usage'] - previous['system_cpu_usage']
    return round((cpu_delta / system_delta) * available_limit_ratio * 100, 0)
|
||||
|
||||
|
||||
@multithread_execution()
@require_running('cpu')
def check_cpu(container, thresholds):
    """Check a running container's CPU usage percentage against the thresholds.

    The thresholds' units are set to '%'.
    """
    container_info = get_container_info(container)
    sample = get_stats(container=container)
    usage = calculate_cpu_capacity_precentage(info=container_info, stats=sample)

    thresholds.units = '%'
    evaluate_numeric_thresholds(container=container, value=usage, thresholds=thresholds, name='cpu',
                                short_name='cpu', min=0, max=100)
|
||||
|
||||
|
||||
def process_args(args):
    """Parse the command line and record the connection settings in module globals.

    Sets the globals ``timeout``, ``daemon`` (base URL for daemon requests) and
    ``connection_type`` ('https', 'socket' or 'http').

    :param args: list of command-line arguments, without the program name
    :return: the argparse namespace
    """
    parser = argparse.ArgumentParser(description='Check docker containers.')

    # Connect to local socket or ip address
    connection_group = parser.add_mutually_exclusive_group()
    connection_group.add_argument('--connection',
                                  dest='connection',
                                  action='store',
                                  default=DEFAULT_SOCKET,
                                  type=str,
                                  metavar='[/<path to>/docker.socket|<ip/host address>:<port>]',
                                  help='Where to find docker daemon socket. (default: %(default)s)')

    connection_group.add_argument('--secure-connection',
                                  dest='secure_connection',
                                  action='store',
                                  type=str,
                                  metavar='[<ip/host address>:<port>]',
                                  help='Where to find TLS protected docker daemon socket.')

    # Base used to convert between memory units; both flags write to the same destination
    base_group = parser.add_mutually_exclusive_group()
    base_group.add_argument('--binary_units',
                            dest='units_base',
                            action='store_const',
                            const=1024,
                            help='Use a base of 1024 when doing calculations of KB, MB, GB, & TB (This is default)')

    base_group.add_argument('--decimal_units',
                            dest='units_base',
                            action='store_const',
                            const=1000,
                            help='Use a base of 1000 when doing calculations of KB, MB, GB, & TB')
    parser.set_defaults(units_base=1024)

    # Connection timeout
    parser.add_argument('--timeout',
                        dest='timeout',
                        action='store',
                        type=float,
                        default=DEFAULT_TIMEOUT,
                        help='Connection timeout in seconds. (default: %(default)s)')

    # Container name
    parser.add_argument('--containers',
                        dest='containers',
                        action='store',
                        nargs='+',
                        type=str,
                        default=['all'],
                        help='One or more RegEx that match the names of the container(s) to check. If omitted all containers are checked. (default: %(default)s)')

    # Require every container pattern to match something
    parser.add_argument('--present',
                        dest='present',
                        default=False,
                        action='store_true',
                        help='Modifies --containers so that each RegEx must match at least one container.')

    # Threads
    parser.add_argument('--threads',
                        dest='threads',
                        default=DEFAULT_PARALLELISM,
                        action='store',
                        type=int,
                        help='This + 1 is the maximum number of concurent threads/network connections. (default: %(default)s)')

    # CPU
    parser.add_argument('--cpu',
                        dest='cpu',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Check cpu usage percentage taking into account any limits.')

    # Memory
    parser.add_argument('--memory',
                        dest='memory',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT:UNITS',
                        help='Check memory usage taking into account any limits. Valid values for units are %%,B,KB,MB,GB.')

    # State
    parser.add_argument('--status',
                        dest='status',
                        action='store',
                        type=str,
                        help='Desired container status (running, exited, etc).')

    # Health
    # default=None (rather than False) so no_checks_present() can tell the flag was not given
    parser.add_argument('--health',
                        dest='health',
                        default=None,
                        action='store_true',
                        help="Check container's health check status")

    # Uptime
    parser.add_argument('--uptime',
                        dest='uptime',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Minimum container uptime in seconds. Use when infrequent crashes are tolerated.')

    # Image Age
    parser.add_argument('--image-age',
                        dest='image_age',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Maximum image age in days.')

    # Version
    # default=None (rather than False) so no_checks_present() can tell the flag was not given
    parser.add_argument('--version',
                        dest='version',
                        default=None,
                        action='store_true',
                        help='Check if the running images are the same version as those in the registry. Useful for finding stale images. Does not support login.')

    # Insecure registries (used by the version check)
    parser.add_argument('--insecure-registries',
                        dest='insecure_registries',
                        action='store',
                        nargs='+',
                        type=str,
                        default=[],
                        help='List of registries to connect to with http(no TLS). Useful when using "--version" with images from insecure registries.')

    # Restart
    parser.add_argument('--restarts',
                        dest='restarts',
                        action='store',
                        type=str,
                        metavar='WARN:CRIT',
                        help='Container restart thresholds.')

    # no-ok
    parser.add_argument('--no-ok',
                        dest='no_ok',
                        action='store_true',
                        help='Make output terse suppressing OK messages. If all checks are OK return a single OK.')

    # no-performance
    parser.add_argument('--no-performance',
                        dest='no_performance',
                        action='store_true',
                        help='Suppress performance data. Reduces output when performance data is not being used.')

    parser.add_argument('-V', action='version', version='%(prog)s {}'.format(__version__))

    # With no arguments at all, show the help text; parsing still continues below.
    if len(args) == 0:
        parser.print_help()

    parsed_args = parser.parse_args(args=args)

    global timeout
    timeout = parsed_args.timeout

    global daemon
    global connection_type
    if parsed_args.secure_connection:
        daemon = 'https://' + parsed_args.secure_connection
        connection_type = 'https'
    elif parsed_args.connection:
        # A leading '/' means a path to a socket file, anything else is host:port.
        if parsed_args.connection[0] == '/':
            # The trailing ':' separates the socket file from the request path (see SocketFileHandler.socket_open).
            daemon = 'socket://' + parsed_args.connection + ':'
            connection_type = 'socket'
        else:
            daemon = 'http://' + parsed_args.connection
            connection_type = 'http'
    # NOTE(review): if --connection is given as an empty string neither global is assigned here.

    return parsed_args
|
||||
|
||||
|
||||
def no_checks_present(parsed_args):
    """Return True when the parsed arguments request no check at all.

    Every module-level name beginning with 'check_' is treated as a check;
    the remainder of the name is looked up as an attribute on parsed_args.
    A check counts as requested when that attribute is not None.
    --present is not implemented as a check function, so it is tested
    separately.
    """
    prefix = 'check_'
    for global_name in list(globals()):
        if not global_name.startswith(prefix):
            continue
        # Strip the 'check_' prefix to get the matching argparse dest.
        if getattr(parsed_args, global_name[len(prefix):]) is not None:
            return False
    # Act like --present is a check though it is not implemented like one
    return not parsed_args.present
|
||||
|
||||
|
||||
def socketfile_permissions_failure(parsed_args):
    """Return True when a socket connection is configured but unusable.

    The path in parsed_args.connection must exist, be a socket, and be
    both readable and writable by the current process. For any connection
    type other than 'socket' there is nothing to verify and the result is
    False.
    """
    if connection_type != 'socket':
        return False

    socket_path = parsed_args.connection
    if not os.path.exists(socket_path):
        return True
    if not stat.S_ISSOCK(os.stat(socket_path).st_mode):
        return True
    if not os.access(socket_path, os.R_OK):
        return True
    return not os.access(socket_path, os.W_OK)
|
||||
|
||||
|
||||
def print_results():
    """Print the collected messages, and performance data if any, on one line.

    With the module-level no_ok flag set, messages starting with 'OK: ' are
    dropped; if nothing remains a bare 'OK' is printed instead. Performance
    data is appended after a '|' unless no_performance is set or no data
    was collected.
    """
    if no_ok:
        # Remove all the "OK"s
        remaining = [text for text in messages if not text.startswith('OK: ')]
        summary = '; '.join(remaining) if remaining else 'OK'
    else:
        summary = '; '.join(messages)

    if not no_performance and len(performance_data) != 0:
        print(summary + '|' + ' '.join(performance_data))
    else:
        print(summary)
|
||||
|
||||
|
||||
def perform_checks(raw_args):
    """Parse the command line and run every requested check on each container.

    Sets the module-level executors, unit table and output flags from the
    parsed arguments, validates the invocation, then calls the requested
    check_* functions for every container returned by get_containers().
    Validation failures are reported through unknown() and end the function
    early.

    :param raw_args: list of command-line arguments, without the program name.
    """
    args = process_args(raw_args)

    global parallel_executor
    parallel_executor = futures.ThreadPoolExecutor(max_workers=args.threads)
    global serial_executor
    serial_executor = futures.ThreadPoolExecutor(max_workers=1)

    global unit_adjustments
    unit_adjustments = {key: args.units_base ** value for key, value in UNIT_ADJUSTMENTS_TEMPLATE.items()}

    global no_ok
    no_ok = args.no_ok

    # Must come from --no-performance; reading args.no_ok here made
    # --no-performance a no-op and let --no-ok suppress performance data.
    global no_performance
    no_performance = args.no_performance

    if socketfile_permissions_failure(args):
        unknown("Cannot access docker socket file. User ID={}, socket file={}".format(os.getuid(), args.connection))
        return

    if args.containers == ["all"] and args.present:
        unknown("You can not use --present without --containers")
        return

    if no_checks_present(args):
        unknown("No checks specified.")
        return

    # Here is where all the work happens
    #############################################################################################
    containers = get_containers(args.containers, args.present)

    if len(containers) == 0 and not args.present:
        unknown("No containers names found matching criteria")
        return

    for container in containers:

        # Check status
        if args.status:
            check_status(container, args.status)

        # Check version
        if args.version:
            check_version(container, args.insecure_registries)

        # below are checks that require a 'running' status

        # Check health
        if args.health:
            check_health(container)

        # Check cpu usage
        if args.cpu:
            check_cpu(container, parse_thresholds(args.cpu, units_required=False))

        # Check memory usage
        if args.memory:
            check_memory(container, parse_thresholds(args.memory, units_required=False))

        # Check uptime
        if args.uptime:
            check_uptime(container, parse_thresholds(args.uptime, include_units=False))

        # Check image age
        if args.image_age:
            check_image_age(container, parse_thresholds(args.image_age, include_units=False))

        # Check restart count
        if args.restarts:
            check_restarts(container, parse_thresholds(args.restarts, include_units=False))
|
||||
|
||||
|
||||
def main():
    """Script entry point: run the checks, print the results, exit with rc.

    Any exception raised while checking, including one re-raised from a
    worker future, has its traceback printed and is reported through
    unknown(). Results are printed and the process exits in every case.
    """
    try:
        perform_checks(argv[1:])

        # get results to let exceptions in threads bubble out
        for finished in futures.as_completed(threads):
            finished.result()

    except Exception as e:
        traceback.print_exc()
        unknown("Exception raised during check': {}".format(repr(e)))

    print_results()
    exit(rc)
|
||||
|
||||
|
||||
# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
@ -7,17 +7,25 @@
|
||||
#
|
||||
|
||||
rr="/var/run/reboot-required"
|
||||
rrpkgs="/var/run/reboot-required.pkgs"
|
||||
# 604800 - 1 week in seconds
|
||||
threshold="${1:-604800}"
|
||||
if [ -f "$rr" ]; then
|
||||
# We have a pending reboot; alert in different states depending on its age
|
||||
lastmod=$(date +%s -r "$rr")
|
||||
now=$(date +%s)
|
||||
pkgs="$(cat "$rr")"
|
||||
if [ -f "$rrpkgs" ]; then
|
||||
pkgs="$(cat "$rrpkgs")"
|
||||
fi
|
||||
if [ -z "$pkgs" ]; then
|
||||
pkgs="(No output)"
|
||||
fi
|
||||
if (( now - lastmod > threshold )); then
|
||||
echo "CRITICAL - Pending reboot older than $threshold seconds: $(cat "$rr")"
|
||||
echo "CRITICAL - Pending reboot older than $threshold seconds: $pkgs"
|
||||
exit 2
|
||||
else
|
||||
echo "WARNING - Pending reboot: $(cat "$rr")"
|
||||
echo "WARNING - Pending reboot: $pkgs"
|
||||
exit 1
|
||||
fi
|
||||
else
|
||||
|
Loading…
Reference in New Issue
Block a user