"""This module implements health status reporting for watchdog operation.
The module receives important health metrics and exports an overall
health assessment. This health assessment can be used by external watchdog
scripts to initiate an agent restart.
Process health is assessed by the following rules:
* it is being shut down and shutdown timeout has not elapsed -> HEALTHY
* it is not registered -> HEALTHY
* process was started more than 6 hours ago and no data was sent to server
within last 6 hours -> FAULTY
* process was started more than 18 hours ago and no data was received from
server within last 18 hours -> FAULTY
Otherwise process is considered HEALTHY.
As the agent exports this information through an RPC interface, there is an
additional implicit "health" requirement that:
* it responds to RPC requests.
This implicit requirement is considered valid because the UI fully depends on
RPC, so it does not make the health assessment any worse than it should be."""
import collections
# Result of a health assessment: ``healthy`` is the boolean verdict and
# ``why`` is a short human-readable reason for it.
HealthStatus = collections.namedtuple("HealthStatus", ("healthy", "why"))


class HealthSensor:
    """Collect agent life-cycle events and derive a health verdict from them.

    A freshly created sensor starts from these defaults:

    * the process start time is 0.0, i.e. "long ago";
    * no shutdown has been requested;
    * the last receive and last send times are 0.0, i.e. "long ago";
    * the agent counts as registered.

    Consequently a sensor that has been told nothing reports a faulty
    status for any ``now`` that is at least ``RECEIVE_WINDOW``.
    """

    # Thresholds share the unit of the ``when`` / ``now`` arguments.
    # ``6 * 3600`` and ``18 * 3600`` are 6 and 18 hours expressed in seconds.
    RECEIVE_WINDOW = 18 * 3600
    SEND_WINDOW = 6 * 3600
    SHUTDOWN_TIMEOUT = 600

    def __init__(self) -> None:
        self._is_registered = True
        # 0.0 means "shutdown not requested"; see ``status``.
        self._shutdown_at = 0.0
        self._started_at = 0.0
        self._last_sent = 0.0
        self._last_received = 0.0

    def starting(self, when: float) -> None:
        """Remember ``when`` as the agent start time."""
        self._started_at = when

    def shutting_down(self, when: float) -> None:
        """Remember ``when`` as the time a shutdown was requested."""
        self._shutdown_at = when

    def server_data_received(self, when: float) -> None:
        """Remember ``when`` as the latest time data arrived from the server."""
        self._last_received = when

    def server_data_sent(self, when: float) -> None:
        """Remember ``when`` as the latest time data went out to the server."""
        self._last_sent = when

    def registered(self) -> None:
        """Flag the agent as registered."""
        self._is_registered = True

    def unregistered(self) -> None:
        """Flag the agent as not registered."""
        self._is_registered = False

    def status(self, now: float) -> HealthStatus:
        """Assess health at time ``now``.

        Rules are evaluated in this order and the first match wins:

        1. A shutdown was recorded (shutdown time greater than zero): faulty
           once ``SHUTDOWN_TIMEOUT`` has elapsed since it, healthy before.
        2. The agent is not registered: healthy.
        3. Both the start time and the last receive time are at least
           ``RECEIVE_WINDOW`` in the past: faulty.
        4. Both the start time and the last send time are at least
           ``SEND_WINDOW`` in the past: faulty.
        5. Otherwise: healthy.

        Returns a ``HealthStatus(healthy, why)`` tuple.
        """
        if self._shutdown_at > 0:
            if now - self._shutdown_at >= self.SHUTDOWN_TIMEOUT:
                return HealthStatus(False, "stuck at shutdown")
            return HealthStatus(True, "shutdown is in progress")

        if not self._is_registered:
            return HealthStatus(True, "not registered")

        # Receive is checked before send, so it wins when both are stale.
        staleness_rules = (
            (
                self._last_received,
                self.RECEIVE_WINDOW,
                "no data received from server",
            ),
            (
                self._last_sent,
                self.SEND_WINDOW,
                "no data sent to server",
            ),
        )
        for last_seen, window, reason in staleness_rules:
            # A recently started process gets a grace period of one window.
            if now - self._started_at >= window and now - last_seen >= window:
                return HealthStatus(False, reason)

        return HealthStatus(True, "all is ok")
# Module-level HealthSensor instance, created at import time with the default
# state. Presumably this is the shared object the rest of the agent reports
# events to and the RPC layer queries -- confirm against callers.
sensor = HealthSensor()