# Copyright 2016 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""
Framework for host verification and repair in Autotest.
The framework provides implementation code in support of `Host.verify()`
and `Host.repair()` used in Verify and Repair special tasks.
The framework consists of these classes:
* `Verifier`: A class representing a single verification check.
* `RepairAction`: A class representing a repair operation that can fix
a failed verification check.
* `RepairStrategy`: A class for organizing a collection of `Verifier`
and `RepairAction` instances, and invoking them in order.
Individual operations during verification and repair are handled by
instances of `Verifier` and `RepairAction`. `Verifier` objects are
meant to test for specific conditions that may cause tests to fail.
`RepairAction` objects provide operations designed to fix one or
more failures identified by a `Verifier` object.
"""
import collections
import logging
import re
import common
from autotest_lib.client.common_lib import error
try:
from chromite.lib import metrics
except ImportError:
from autotest_lib.client.bin.utils import metrics_mock as metrics
#Regular experssion pattern to filter out unwanted hostname.
_HOSTNAME_PATTERN = 'chromeos[0-9]-row[0-9]+[a-z]?-rack[0-9]+[a-z]?-host[0-9]+'
_DISALLOWED_HOSTNAME = 'disallowed_hostname'
class AutoservVerifyError(error.AutoservError):
"""
Generic Exception for failures from `Verifier` objects.
Instances of this exception can be raised when a `verify()`
method fails, if no more specific exception is available.
"""
pass
_DependencyFailure = collections.namedtuple(
'_DependencyFailure', ('dependency', 'error', 'tag'))
class AutoservVerifyDependencyError(error.AutoservError):
"""
Exception raised for failures in dependencies.
This exception is used to distinguish an original failure from a
failure being passed back from a verification dependency. That is,
if 'B' depends on 'A', and 'A' fails, 'B' will raise this exception
to signal that the original failure is further down the dependency
chain.
The `failures` argument to the constructor for this class is a set
of instances of `_DependencyFailure`, each corresponding to one
failed dependency:
* The `dependency` attribute of each failure is the description
of the failed dependency.
* The `error` attribute of each failure is the string value of
the exception from the failed dependency.
Multiple methods in this module recognize and handle this exception
specially.
@property failures Set of failures passed to the constructor.
@property _node Instance of `_DependencyNode` reporting the
failed dependencies.
"""
def __init__(self, node, failures):
"""
Constructor for `AutoservVerifyDependencyError`.
@param node Instance of _DependencyNode reporting the
failed dependencies.
@param failures List of failure tuples as described above.
"""
super(AutoservVerifyDependencyError, self).__init__(
'\n'.join([f.error for f in failures]))
self.failures = failures
self._node = node
def log_dependencies(self, action, deps):
"""
Log an `AutoservVerifyDependencyError`.
This writes a short summary of the dependency failures captured
in this exception, using standard Python logging.
The passed in `action` string plus `self._node.description`
are logged at INFO level. The `action` argument should
introduce or describe an action relative to `self._node`.
The passed in `deps` string and the description of each failed
dependency in `self` are be logged at DEBUG level. The `deps`
argument is used to introduce the various failed dependencies.
@param action A string mentioning the action being logged
relative to `self._node`.
@param deps A string introducing the dependencies that
failed.
"""
logging.info('%s: %s', action, self._node.description)
logging.debug('%s:', deps)
for failure in self.failures:
logging.debug(' %s', failure.dependency)
class AutoservRepairError(error.AutoservError):
"""
Generic Exception for failures from `RepairAction` objects.
Instances of this exception can be raised when a `repair()`
method fails, if no more specific exception is available.
"""
def __init__(self, description, tag):
"""
@param description Message describe the exception.
@param tag A short identifier used for metric purpose.
"""
super(AutoservRepairError, self).__init__(description)
self.tag = tag
class _DependencyNode(object):
"""
An object that can depend on verifiers.
Both repair and verify operations have the notion of dependencies
that must pass before the operation proceeds. This class captures
the shared behaviors required by both classes.
@property tag Short identifier to be used in logging.
@property description Text summary of this node's action, to be
used in debug logs.
@property _dependency_list Dependency pre-requisites.
"""
def __init__(self, tag, record_type, dependencies):
self._dependency_list = dependencies
self._tag = tag
self._record_tag = record_type + '.' + tag
def _record(self, host, silent, status_code, *record_args):
"""
Log a status record for `host`.
Call `host.record()` using the given status_code, and
operation tag `self._record_tag`, plus any extra arguments in
`record_args`. Do nothing if `silent` is a true value.
@param host Host which will record the status record.
@param silent Don't record the event if this is a true
value.
@param status_code Value for the `status_code` parameter to
`host.record()`.
@param record_args Additional arguments to pass to
`host.record()`.
"""
if not silent:
host.record(status_code, None, self._record_tag,
*record_args)
def _record_good(self, host, silent):
"""Log a 'GOOD' status line.
@param host Host which will record the status record.
@param silent Don't record the event if this is a true
value.
"""
self._record(host, silent, 'GOOD')
def _record_fail(self, host, silent, exc):
"""Log a 'FAIL' status line.
@param host Host which will record the status record.
@param silent Don't record the event if this is a true
value.
@param exc Exception describing the cause of failure.
"""
self._record(host, silent, 'FAIL', str(exc))
def _verify_list(self, host, verifiers, silent):
"""
Test a list of verifiers against a given host.
This invokes `_verify_host()` on every verifier in the given
list. If any verifier in the transitive closure of dependencies
in the list fails, an `AutoservVerifyDependencyError` is raised
containing the description of each failed verifier. Only
original failures are reported; verifiers that don't run due
to a failed dependency are omitted.
By design, original failures are logged once in `_verify_host()`
when `verify()` originally fails. The additional data gathered
here is for the debug logs to indicate why a subsequent
operation never ran.
@param host The host to be tested against the verifiers.
@param verifiers List of verifiers to be checked.
@param silent If true, don't log host status records.
@raises AutoservVerifyDependencyError Raised when at least
one verifier in the list has failed.
"""
failures = set()
for v in verifiers:
try:
v._verify_host(host, silent)
except AutoservVerifyDependencyError as e:
failures.update(e.failures)
except Exception as e:
failures.add(_DependencyFailure(v.description, str(e), v.tag))
if failures:
raise AutoservVerifyDependencyError(self, failures)
def _verify_dependencies(self, host, silent):
"""
Verify that all of this node's dependencies pass for a host.
@param host The host to be verified.
@param silent If true, don't log host status records.
"""
try:
self._verify_list(host, self._dependency_list, silent)
except AutoservVerifyDependencyError as e:
e.log_dependencies(
'Skipping this operation',
'The following dependencies failed')
raise
@property
def tag(self):
"""
Tag for use in logging status records.
This is a property with a short string used to identify the node
in the 'status.log' file and during node construction. The tag
should contain only letters, digits, and '_' characters. This
tag is not used alone, but is combined with other identifiers,
based on the operation being logged.
@return A short identifier-like string.
"""
return self._tag
@property
def description(self):
"""
Text description of this node for log messages.
This string will be logged with failures, and should describe
the condition required for success.
N.B. Subclasses are required to override this method, but we
_don't_ raise NotImplementedError here. Various methods fail in
inscrutable ways if this method raises any exception, so for
debugging purposes, it's better to return a default value.
@return A descriptive string.
"""
return ('Class %s fails to implement description().' %
type(self).__name__)
class Verifier(_DependencyNode):
"""
Abstract class embodying one verification check.
A concrete subclass of `Verifier` provides a simple check that can
determine a host's fitness for testing. Failure indicates that the
check found a problem that can cause at least one test to fail.
`Verifier` objects are organized in a DAG identifying dependencies
among operations. The DAG controls ordering and prevents wasted
effort: If verification operation V2 requires that verification
operation V1 pass, then a) V1 will run before V2, and b) if V1
fails, V2 won't run at all. The `_verify_host()` method ensures
that all dependencies run and pass before invoking the `verify()`
method.
A `Verifier` object caches its result the first time it calls
`verify()`. Subsequent calls return the cached result, without
re-running the check code. The `_reverify()` method clears the
cached result in the current node, and in all dependencies.
Subclasses must supply these properties and methods:
* `verify()`: This is the method to perform the actual
verification check.
* `description`: A one-line summary of the verification check for
debug log messages.
Subclasses must override all of the above attributes; subclasses
should not override or extend any other attributes of this class.
The description string should be a simple sentence explaining what
must be true for the verifier to pass. Do not include a terminating
period. For example:
Host is available via ssh
The base class manages the following private data:
* `_result`: The cached result of verification.
* `_dependency_list`: The list of dependencies.
Subclasses should not use these attributes.
@property _result Cached result of verification.
"""
def __init__(self, tag, dependencies):
super(Verifier, self).__init__(tag, 'verify', dependencies)
self._result = None
def _reverify(self):
"""
Discard cached verification results.
Reset the cached verification result for this node, and for the
transitive closure of all dependencies.
"""
if self._result is not None:
self._result = None
for v in self._dependency_list:
v._reverify()
def _verify_host(self, host, silent):
"""
Determine the result of verification, and log results.
If this verifier does not have a cached verification result,
check dependencies, and if they pass, run `verify()`. Log
informational messages regarding failed dependencies. If we
call `verify()`, log the result in `status.log`.
If we already have a cached result, return that result without
logging any message.
@param host The host to be tested for a problem.
@param silent If true, don't log host status records.
"""
if self._result is not None:
if isinstance(self._result, Exception):
raise self._result # cached failure
elif self._result:
return # cached success
self._result = False
self._verify_dependencies(host, silent)
logging.info('Verifying this condition: %s', self.description)
try:
self.verify(host)
self._record_good(host, silent)
except Exception as e:
logging.exception('Failed: %s', self.description)
self._result = e
self._record_fail(host, silent, e)
raise
self._result = True
def verify(self, host):
"""
Unconditionally perform a verification check.
This method is responsible for testing for a single problem on a
host. Implementations should follow these guidelines:
* The check should find a problem that will cause testing to
fail.
* Verification checks on a working system should run quickly
and should be optimized for success; a check that passes
should finish within seconds.
* Verification checks are not expected have side effects, but
may apply trivial fixes if they will finish within the time
constraints above.
A verification check should normally trigger a single set of
repair actions. If two different failures can require two
different repairs, ideally they should use two different
subclasses of `Verifier`.
Implementations indicate failure by raising an exception. The
exception text should be a short, 1-line summary of the error.
The text should be concise and diagnostic, as it will appear in
`status.log` files.
If this method finds no problems, it returns without raising any
exception.
Implementations should avoid most logging actions, but can log
DEBUG level messages if they provide significant information for
diagnosing failures.
@param host The host to be tested for a problem.
"""
raise NotImplementedError('Class %s does not implement '
'verify()' % type(self).__name__)
class RepairAction(_DependencyNode):
"""
Abstract class embodying one repair procedure.
A `RepairAction` is responsible for fixing one or more failed
`Verifier` checks, in order to make those checks pass.
Each repair action includes one or more verifier triggers that
determine when the repair action should run. A repair action
will call its `repair()` method if one or more of its triggers
fails. A repair action is successful if all of its triggers pass
after calling `repair()`.
A `RepairAction` is a subclass of `_DependencyNode`; if any of a
repair action's dependencies fail, the action does not check its
triggers, and doesn't call `repair()`.
Subclasses must supply these attributes:
* `repair()`: This is the method to perform the necessary
repair. The method should avoid most logging actions, but
can log DEBUG level messages if they provide significant
information for diagnosing failures.
* `description`: A one-line summary of the repair action for
debug log messages.
Subclasses must override both of the above attributes and should
not override any other attributes of this class.
The description string should be a simple sentence explaining the
operation that will be performed. Do not include a terminating
period. For example:
Re-install the stable build via AU
@property _trigger_list List of verification checks that will
trigger this repair when they fail.
@property host_class A string identifier that will be
used as a field to send repair metrics.
"""
def __init__(self, tag, dependencies, triggers, host_class):
super(RepairAction, self).__init__(tag, 'repair', dependencies)
self._trigger_list = triggers
self._failure_modes_counter = metrics.Counter(
'chromeos/autotest/repair/failure_modes')
self._failure_detail_counter = metrics.Counter(
'chromeos/autotest/repair/failure_detail')
self.host_class = host_class
def _record_start(self, host, silent):
"""Log a 'START' status line.
@param host Host which will record the status record.
@param silent Don't record the event if this is a true
value.
"""
self._record(host, silent, 'START')
def _record_end_good(self, host, silent):
"""Log an 'END GOOD' status line.
@param host Host which will record the status record.
@param silent Don't record the event if this is a true
value.
"""
self._record(host, silent, 'END GOOD')
self.status = 'repaired'
def _record_end_fail(self, host, silent, status, *args):
"""Log an 'END FAIL' status line.
@param host Host which will record the status record.
@param silent Don't record the event if this is a true
value.
@param args Extra arguments to `self._record()`
"""
self._record(host, silent, 'END FAIL', *args)
self.status = status
def _send_failure_metrics(self, host, error, stage):
"""Send failure mode metrics to monarch
@param host Host which this RepairAction targeted to.
@param error An exception that caught in _repair_host.
@param stage In which stage we caught above exception.
Can be one of below value:
'dep' during verify dependencies
'pre' during pre-repair trigger verification
'repair' during repair() process itself
'post' during post-repair trigger verification
"""
def get_fields(vf_tag):
fields = {
'ra_tag': self.tag,
'vf_tag': vf_tag,
'hostname': _filter_metrics_hostname(host),
'stage': stage,
'host_class': self.host_class
}
return fields
if isinstance(error, AutoservVerifyDependencyError):
# We'll catch all failure tags here for a dependencies error
for f in error.failures:
self._failure_modes_counter.increment(fields=get_fields(f.tag))
else:
# When there is failure during repair or unknown failure. there
# will be no Verifier, so vf_tag set to 'unknown'.
self._failure_modes_counter.increment(fields=get_fields('unknown'))
if stage == 'repair':
self._send_failure_detail(error)
def _send_failure_detail(self, error):
"""Send reason of failure inside repair() to monarch.
@param error The exception caught inside repair().
"""
tag = error.tag if isinstance(error, AutoservRepairError) else 'unknown'
fields = {'repair_action_tag': self.tag, 'repair_failure_tag': tag}
self._failure_detail_counter.increment(fields=fields)
def _repair_host(self, host, silent):
"""
Apply this repair action if any triggers fail.
Repair is triggered when all dependencies are successful, and at
least one trigger fails.
If the `repair()` method triggers, the success or failure of
this operation is logged in `status.log` bracketed by 'START'
and 'END' records. Details of whether or why `repair()`
triggered are written to the debug logs. If repair doesn't
trigger, nothing is logged to `status.log`.
@param host The host to be repaired.
@param silent If true, don't log host status records.
"""
# Note: Every exit path from the method must set `self.status`.
# There's a lot of exit paths, so be careful.
#
# If we're blocked by a failed dependency, we exit with an
# exception. So set status to 'blocked' first.
self.status = 'blocked'
try:
self._verify_dependencies(host, silent)
except Exception as e:
self._send_failure_metrics(host, e, 'dep')
raise
# This is a defensive action. Every path below should overwrite
# this setting, but if it doesn't, we want our status to reflect
# a coding error.
self.status = 'unknown'
try:
self._verify_list(host, self._trigger_list, silent)
except AutoservVerifyDependencyError as e:
e.log_dependencies(
'Attempting this repair action',
'Repairing because these triggers failed')
self._send_failure_metrics(host, e, 'pre')
self._record_start(host, silent)
try:
self.repair(host)
except Exception as e:
logging.exception('Repair failed: %s', self.description)
self._record_fail(host, silent, e)
self._record_end_fail(host, silent, 'repair_failure')
self._send_failure_metrics(host, e, 'repair')
raise
try:
for v in self._trigger_list:
v._reverify()
self._verify_list(host, self._trigger_list, silent)
self._record_end_good(host, silent)
except AutoservVerifyDependencyError as e:
e.log_dependencies(
'This repair action reported success',
'However, these triggers still fail')
self._record_end_fail(host, silent, 'verify_failure')
self._send_failure_metrics(host, e, 'post')
raise AutoservRepairError(
'Some verification checks still fail', 'post_verify')
except Exception:
# The specification for `self._verify_list()` says
# that this can't happen; this is a defensive
# precaution.
self._record_end_fail(host, silent, 'unknown',
'Internal error in repair')
self._send_failure_metrics(host, e, 'post')
raise
else:
self.status = 'skipped'
logging.info('No failed triggers, skipping repair: %s',
self.description)
def repair(self, host):
"""
Apply this repair action to the given host.
This method is responsible for applying changes to fix failures
in one or more verification checks. The repair is considered
successful if the DUT passes the specific checks after this
method completes.
Implementations indicate failure by raising an exception. The
exception text should be a short, 1-line summary of the error.
The text should be concise and diagnostic, as it will appear in
`status.log` files.
If this method completes successfully, it returns without
raising any exception.
Implementations should avoid most logging actions, but can log
DEBUG level messages if they provide significant information for
diagnosing failures.
@param host The host to be repaired.
"""
raise NotImplementedError('Class %s does not implement '
'repair()' % type(self).__name__)
class _RootVerifier(Verifier):
"""
Utility class used by `RepairStrategy`.
A node of this class by itself does nothing; it always passes (if it
can run). This class exists merely to be the root of a DAG of
dependencies in an instance of `RepairStrategy`.
"""
def verify(self, host):
pass
@property
def description(self):
return 'All host verification checks pass'
class RepairStrategy(object):
"""
A class for organizing `Verifier` and `RepairAction` objects.
An instance of `RepairStrategy` is organized as a DAG of `Verifier`
objects, plus a list of `RepairAction` objects. The class provides
methods for invoking those objects in the required order, when
needed:
* The `verify()` method walks the verifier DAG in dependency
order.
* The `repair()` method invokes the repair actions in list order.
Each repair action will invoke its dependencies and triggers as
needed.
# The Verifier DAG
The verifier DAG is constructed from the first argument passed to
the passed to the `RepairStrategy` constructor. That argument is an
iterable consisting of three-element tuples in the form
`(constructor, tag, deps)`:
* The `constructor` value is a callable that creates a `Verifier`
as for the interface of the class constructor. For classes
that inherit the default constructor from `Verifier`, this can
be the class itself.
* The `tag` value is the tag to be associated with the constructed
verifier.
* The `deps` value is an iterable (e.g. list or tuple) of strings.
Each string corresponds to the `tag` member of a `Verifier`
dependency.
The tag names of verifiers in the constructed DAG must all be
unique. The tag name defined by `RepairStrategy.ROOT_TAG` is
reserved and may not be used by any verifier.
In the input data for the constructor, dependencies must appear
before the nodes that depend on them. Thus:
((A, 'a', ()), (B, 'b', ('a',))) # This is valid
((B, 'b', ('a',)), (A, 'a', ())) # This will fail!
Internally, the DAG of verifiers is given unique root node. So,
given this input:
((C, 'c', ()),
(A, 'a', ('c',)),
(B, 'b', ('c',)))
The following DAG is constructed:
Root
/ \
A B
\ /
C
Since nothing depends on `A` or `B`, the root node guarantees that
these two verifiers will both be called and properly logged.
The root node is not directly accessible; however repair actions can
trigger on it by using `RepairStrategy.ROOT_TAG`. Additionally, the
node will be logged in `status.log` whenever `verify()` succeeds.
# The Repair Actions List
The list of repair actions is constructed from the second argument
passed to the passed to the `RepairStrategy` constructor. That
argument is an iterable consisting of four-element tuples in the
form `(constructor, tag, deps, triggers)`:
* The `constructor` value is a callable that creates a
`RepairAction` as for the interface of the class constructor.
For classes that inherit the default constructor from
`RepairAction`, this can be the class itself.
* The `tag` value is the tag to be associated with the constructed
repair action.
* The `deps` value is an iterable (e.g. list or tuple) of strings.
Each string corresponds to the `tag` member of a `Verifier` that
the repair action depends on.
* The `triggers` value is an iterable (e.g. list or tuple) of
strings. Each string corresponds to the `tag` member of a
`Verifier` that can trigger the repair action.
`RepairStrategy` deps and triggers can only refer to verifiers,
not to other repair actions.
"""
# This name is reserved; clients may not use it.
ROOT_TAG = 'PASS'
@staticmethod
def _add_verifier(verifiers, constructor, tag, dep_tags):
"""
Construct and remember a verifier.
Create a `Verifier` using `constructor` and `tag`. Dependencies
for construction are found by looking up `dep_tags` in the
`verifiers` dictionary.
After construction, the new verifier is added to `verifiers`.
@param verifiers Dictionary of verifiers, indexed by tag.
@param constructor Verifier construction function.
@param tag Tag parameter for the construction function.
@param dep_tags Tags of dependencies for the constructor, to
be found in `verifiers`.
"""
assert tag not in verifiers
deps = [verifiers[d] for d in dep_tags]
verifiers[tag] = constructor(tag, deps)
def __init__(self, verifier_data, repair_data, host_class):
"""
Construct a `RepairStrategy` from simplified DAG data.
The input `verifier_data` object describes how to construct
verify nodes and the dependencies that relate them, as detailed
above.
The input `repair_data` object describes how to construct repair
actions and their dependencies and triggers, as detailed above.
@param verifier_data Iterable value with constructors for the
elements of the verification DAG and their
dependencies.
@param repair_data Iterable value with constructors for the
elements of the repair action list, and
their dependencies and triggers.
@property host_class A string identifier that identify what
class of host this repair strategy target
on, will be used as a field to send repair
metrics.
"""
# Metrics - we report on 'actions' for every repair action
# we execute; we report on 'strategy' for every complete
# repair operation.
self._strategy_counter = metrics.Counter(
'chromeos/autotest/repair/repair_strategy_v2')
self._actions_counter = metrics.Counter(
'chromeos/autotest/repair/repair_actions')
self.host_class = host_class
# We use the `all_verifiers` list to guarantee that our root
# verifier will execute its dependencies in the order provided
# to us by our caller.
verifier_map = {}
all_tags = []
dependencies = set()
for constructor, tag, deps in verifier_data:
self._add_verifier(verifier_map, constructor, tag, deps)
dependencies.update(deps)
all_tags.append(tag)
# Capture all the verifiers that have nothing depending on them.
root_tags = [t for t in all_tags if t not in dependencies]
self._add_verifier(verifier_map, _RootVerifier,
self.ROOT_TAG, root_tags)
self._verify_root = verifier_map[self.ROOT_TAG]
self._repair_actions = []
for constructor, tag, deps, triggers in repair_data:
r = constructor(tag,
[verifier_map[d] for d in deps],
[verifier_map[t] for t in triggers],
self.host_class)
self._repair_actions.append(r)
def _send_strategy_metrics(self, host, result):
"""Send repair strategy metrics to monarch
@param host The target to be repaired.
@param result A String that describe a final result for the
RepairStrategy.
"""
info = host.host_info_store.get()
board = info.board if info.board else 'unknown'
model = info.model if info.model else 'unknown'
fields = {
'board': board,
'host_class': self.host_class,
'hostname': _filter_metrics_hostname(host),
'model': model,
'result': result,
}
self._strategy_counter.increment(fields=fields)
def _send_action_metrics(self, host, ra):
"""Send repair action metrics to monarch
@param host The target to be repaired.
@param ra an RepairAction instance.
"""
fields = {
'tag': ra.tag,
'status': ra.status,
'hostname': _filter_metrics_hostname(host),
'host_class': self.host_class
}
self._actions_counter.increment(fields=fields)
def verify(self, host, silent=False):
"""
Run the verifier DAG on the given host.
@param host The target to be verified.
@param silent If true, don't log host status records.
"""
self._verify_root._reverify()
self._verify_root._verify_host(host, silent)
def repair(self, host, silent=False):
"""
Run the repair list on the given host.
@param host The target to be repaired.
@param silent If true, don't log host status records.
"""
self._verify_root._reverify()
attempted = False
for ra in self._repair_actions:
try:
ra._repair_host(host, silent)
except Exception as e:
# all logging and exception handling was done at
# lower levels
pass
finally:
self._send_action_metrics(host, ra)
if ra.status not in ('skipped', 'blocked'):
attempted = True
result = 'failure'
try:
self._verify_root._verify_host(host, silent)
result = 'success' if attempted else 'not_attempted'
except:
if not attempted:
result = 'attempt_blocked'
raise
finally:
self._send_strategy_metrics(host, result)
def _filter_metrics_hostname(host):
"""
Restrict format of hostnames we'll send to monarch
@param host An host instance(i.e. ServoHost, CrosHost)
"""
if re.match(_HOSTNAME_PATTERN, host.hostname):
return host.hostname
else:
return _DISALLOWED_HOSTNAME