# Copyright 2015 The Chromium OS Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. """Services relating to DUT status and job history. The central abstraction of this module is the `HostJobHistory` class. This class provides two related pieces of information regarding a single DUT: * A history of tests and special tasks that have run on the DUT in a given time range. * Whether the DUT was "working" or "broken" at a given time. The "working" or "broken" status of a DUT is determined by the DUT's special task history. At the end of any job or task, the status is indicated as follows: * After any successful special task, the DUT is considered "working". * After any failed Repair task, the DUT is considered "broken". * After any other special task or after any regular test job, the DUT's status is considered unchanged. Definitions for terms used in the code below: * status task - Any special task that determines the DUT's status; that is, any successful task, or any failed Repair. * diagnosis interval - A time interval during which DUT status changed either from "working" to "broken", or vice versa. The interval starts with the last status task with the old status, and ends after the first status task with the new status. Diagnosis intervals are interesting because they normally contain the logs explaining a failure or repair event. """ import common import os from autotest_lib.frontend import setup_django_environment from django.db import models as django_models from autotest_lib.client.common_lib import global_config from autotest_lib.client.common_lib import site_utils from autotest_lib.client.common_lib import time_utils from autotest_lib.frontend.afe import models as afe_models from autotest_lib.site_utils.suite_scheduler import constants # Values used to describe the diagnosis of a DUT. These values are # used to indicate both DUT status after a job or task, and also # diagnosis of whether the DUT was working at the end of a given # time interval. # # UNUSED: Used when there are no events recorded in a given # time interval. # UNKNOWN: For an individual event, indicates that the DUT status # is unchanged from the previous event. For a time interval, # indicates that the DUT's status can't be determined from the # DUT's history. # WORKING: Indicates that the DUT was working normally after the # event, or at the end of the time interval. # BROKEN: Indicates that the DUT needed manual repair after the # event, or at the end of the time interval. # UNUSED = 0 UNKNOWN = 1 WORKING = 2 BROKEN = 3 def parse_time(time_string): """Parse time according to a canonical form. The "canonical" form is the form in which date/time values are stored in the database. @param time_string Time to be parsed. """ return int(time_utils.to_epoch_time(time_string)) class _JobEvent(object): """Information about an event in host history. This remembers the relevant data from a single event in host history. An event is any change in DUT state caused by a job or special task. The data captured are the start and end times of the event, the URL of logs to the job or task causing the event, and a diagnosis of whether the DUT was working or failed afterwards. This class is an adapter around the database model objects describing jobs and special tasks. This is an abstract superclass, with concrete subclasses for `HostQueueEntry` and `SpecialTask` objects. @property start_time Time the job or task began execution. @property end_time Time the job or task finished execution. @property id id of the event in the AFE database. @property name Name of the event, derived from the AFE database. @property job_status Short string describing the event's final status. @property logdir Relative path to the logs for the event's job. @property job_url URL to the logs for the event's job. @property gs_url GS URL to the logs for the event's job. @property job_id id of the AFE job for HQEs. None otherwise. @property diagnosis Working status of the DUT after the event. @property is_special Boolean indicating if the event is a special task. """ get_config_value = global_config.global_config.get_config_value _LOG_URL_PATTERN = get_config_value('CROS', 'log_url_pattern') @classmethod def get_log_url(cls, afe_hostname, logdir): """Return a URL to job results. The URL is constructed from a base URL determined by the global config, plus the relative path of the job's log directory. @param afe_hostname Hostname for autotest frontend @param logdir Relative path of the results log directory. @return A URL to the requested results log. """ return cls._LOG_URL_PATTERN % (afe_hostname, logdir) @classmethod def get_gs_url(cls, logdir): """Return a GS URL to job results. The URL is constructed from a base URL determined by the global config, plus the relative path of the job's log directory. @param logdir Relative path of the results log directory. @return A URL to the requested results log. """ return os.path.join(site_utils.get_offload_gsuri(), logdir) def __init__(self, start_time, end_time): self.start_time = parse_time(start_time) self.end_time = parse_time(end_time) def __cmp__(self, other): """Compare two jobs by their start time. This is a standard Python `__cmp__` method to allow sorting `_JobEvent` objects by their times. @param other The `_JobEvent` object to compare to `self`. """ return self.start_time - other.start_time @property def id(self): """Return the id of the event in the AFE database.""" raise NotImplementedError() @property def name(self): """Return the name of the event.""" raise NotImplementedError() @property def job_status(self): """Return a short string describing the event's final status.""" raise NotImplementedError() @property def logdir(self): """Return the relative path for this event's job logs.""" raise NotImplementedError() @property def job_url(self): """Return the URL for this event's job logs.""" raise NotImplementedError() @property def gs_url(self): """Return the GS URL for this event's job logs.""" raise NotImplementedError() @property def job_id(self): """Return the id of the AFE job for HQEs. None otherwise.""" raise NotImplementedError() @property def diagnosis(self): """Return the status of the DUT after this event. The diagnosis is interpreted as follows: UNKNOWN - The DUT status was the same before and after the event. WORKING - The DUT appeared to be working after the event. BROKEN - The DUT likely required manual intervention after the event. @return A valid diagnosis value. """ raise NotImplementedError() @property def is_special(self): """Return if the event is for a special task.""" raise NotImplementedError() class _SpecialTaskEvent(_JobEvent): """`_JobEvent` adapter for special tasks. This class wraps the standard `_JobEvent` interface around a row in the `afe_special_tasks` table. """ @classmethod def get_tasks(cls, afe, host_id, start_time, end_time): """Return special tasks for a host in a given time range. Return a list of `_SpecialTaskEvent` objects representing all special tasks that ran on the given host in the given time range. The list is ordered as it was returned by the query (i.e. unordered). @param afe Autotest frontend @param host_id Database host id of the desired host. @param start_time Start time of the range of interest. @param end_time End time of the range of interest. @return A list of `_SpecialTaskEvent` objects. """ query_start = time_utils.epoch_time_to_date_string(start_time) query_end = time_utils.epoch_time_to_date_string(end_time) tasks = afe.get_host_special_tasks( host_id, time_started__gte=query_start, time_finished__lte=query_end, is_complete=1) return [cls(afe.server, t) for t in tasks] @classmethod def get_status_task(cls, afe, host_id, end_time): """Return the task indicating a host's status at a given time. The task returned determines the status of the DUT; the diagnosis on the task indicates the diagnosis for the DUT at the given `end_time`. @param afe Autotest frontend @param host_id Database host id of the desired host. @param end_time Find status as of this time. @return A `_SpecialTaskEvent` object for the requested task, or `None` if no task was found. """ query_end = time_utils.epoch_time_to_date_string(end_time) task = afe.get_host_status_task(host_id, query_end) return cls(afe.server, task) if task else None def __init__(self, afe_hostname, afetask): self._afe_hostname = afe_hostname self._afetask = afetask super(_SpecialTaskEvent, self).__init__( afetask.time_started, afetask.time_finished) @property def id(self): return self._afetask.id @property def name(self): return self._afetask.task @property def job_status(self): if self._afetask.is_aborted: return 'ABORTED' elif self._afetask.success: return 'PASS' else: return 'FAIL' @property def logdir(self): return ('hosts/%s/%s-%s' % (self._afetask.host.hostname, self._afetask.id, self._afetask.task.lower())) @property def job_url(self): return _SpecialTaskEvent.get_log_url(self._afe_hostname, self.logdir) @property def gs_url(self): return _SpecialTaskEvent.get_gs_url(self.logdir) @property def job_id(self): return None @property def diagnosis(self): if self._afetask.success: return WORKING elif self._afetask.task == 'Repair': return BROKEN else: return UNKNOWN @property def is_special(self): return True class _TestJobEvent(_JobEvent): """`_JobEvent` adapter for regular test jobs. This class wraps the standard `_JobEvent` interface around a row in the `afe_host_queue_entries` table. """ @classmethod def get_hqes(cls, afe, host_id, start_time, end_time): """Return HQEs for a host in a given time range. Return a list of `_TestJobEvent` objects representing all the HQEs of all the jobs that ran on the given host in the given time range. The list is ordered as it was returned by the query (i.e. unordered). @param afe Autotest frontend @param host_id Database host id of the desired host. @param start_time Start time of the range of interest. @param end_time End time of the range of interest. @return A list of `_TestJobEvent` objects. """ query_start = time_utils.epoch_time_to_date_string(start_time) query_end = time_utils.epoch_time_to_date_string(end_time) hqelist = afe.get_host_queue_entries( host_id=host_id, start_time=query_start, end_time=query_end, complete=1) return [cls(afe.server, hqe) for hqe in hqelist] def __init__(self, afe_hostname, hqe): self._afe_hostname = afe_hostname self._hqe = hqe super(_TestJobEvent, self).__init__( hqe.started_on, hqe.finished_on) @property def id(self): return self._hqe.id @property def name(self): return self._hqe.job.name @property def job_status(self): return self._hqe.status @property def logdir(self): return _get_job_logdir(self._hqe.job) @property def job_url(self): return _TestJobEvent.get_log_url(self._afe_hostname, self.logdir) @property def gs_url(self): return _TestJobEvent.get_gs_url(self.logdir) @property def job_id(self): return self._hqe.job.id @property def diagnosis(self): return UNKNOWN @property def is_special(self): return False class HostJobHistory(object): """Class to query and remember DUT execution and status history. This class is responsible for querying the database to determine the history of a single DUT in a time interval of interest, and for remembering the query results for reporting. @property hostname Host name of the DUT. @property start_time Start of the requested time interval, as a unix timestamp (epoch time). This field may be `None`. @property end_time End of the requested time interval, as a unix timestamp (epoch time). @property _afe Autotest frontend for queries. @property _host Database host object for the DUT. @property _history A list of jobs and special tasks that ran on the DUT in the requested time interval, ordered in reverse, from latest to earliest. @property _status_interval A list of all the jobs and special tasks that ran on the DUT in the last diagnosis interval prior to `end_time`, ordered from latest to earliest. @property _status_diagnosis The DUT's status as of `end_time`. @property _status_task The DUT's last status task as of `end_time`. """ @classmethod def get_host_history(cls, afe, hostname, start_time, end_time): """Create a `HostJobHistory` instance for a single host. Simple factory method to construct host history from a hostname. Simply looks up the host in the AFE database, and passes it to the class constructor. @param afe Autotest frontend @param hostname Name of the host. @param start_time Start time for the history's time interval. @param end_time End time for the history's time interval. @return A new `HostJobHistory` instance. """ afehost = afe.get_hosts(hostname=hostname)[0] return cls(afe, afehost, start_time, end_time) @classmethod def get_multiple_histories(cls, afe, start_time, end_time, board=None, pool=None): """Create `HostJobHistory` instances for a set of hosts. The set of hosts can be specified as "all hosts of a given board type", "all hosts in a given pool", or "all hosts of a given board and pool". @param afe Autotest frontend @param start_time Start time for the history's time interval. @param end_time End time for the history's time interval. @param board All hosts must have this board type; if `None`, all boards are allowed. @param pool All hosts must be in this pool; if `None`, all pools are allowed. @return A list of new `HostJobHistory` instances. """ # If `board` or `pool` are both `None`, we could search the # entire database, which is more expensive than we want. # Our caller currently won't (can't) do this, but assert to # be safe. assert board is not None or pool is not None labels = [] if board is not None: labels.append(constants.Labels.BOARD_PREFIX + board) if pool is not None: labels.append(constants.Labels.POOL_PREFIX + pool) kwargs = {'multiple_labels': labels} hosts = afe.get_hosts(**kwargs) return [cls(afe, h, start_time, end_time) for h in hosts] def __init__(self, afe, afehost, start_time, end_time): self._afe = afe self.hostname = afehost.hostname self.end_time = end_time self.start_time = start_time self._host = afehost # Don't spend time on queries until they're needed. self._history = None self._status_interval = None self._status_diagnosis = None self._status_task = None def _get_history(self, start_time, end_time): """Get the list of events for the given interval.""" newtasks = _SpecialTaskEvent.get_tasks( self._afe, self._host.id, start_time, end_time) newhqes = _TestJobEvent.get_hqes( self._afe, self._host.id, start_time, end_time) newhistory = newtasks + newhqes newhistory.sort(reverse=True) return newhistory def __iter__(self): if self._history is None: self._history = self._get_history(self.start_time, self.end_time) return self._history.__iter__() def _extract_prefixed_label(self, prefix): labels = [l for l in self._host.labels if l.startswith(prefix)] return labels[0][len(prefix) : ] if labels else None @property def host(self): """Return the AFE host object for this history.""" return self._host @property def host_board(self): """Return the board name for this history's DUT.""" prefix = constants.Labels.BOARD_PREFIX return self._extract_prefixed_label(prefix) @property def host_pool(self): """Return the pool name for this history's DUT.""" prefix = constants.Labels.POOL_PREFIX return self._extract_prefixed_label(prefix) def _init_status_task(self): """Fill in `self._status_diagnosis` and `_status_task`.""" if self._status_diagnosis is not None: return self._status_task = _SpecialTaskEvent.get_status_task( self._afe, self._host.id, self.end_time) if self._status_task is not None: self._status_diagnosis = self._status_task.diagnosis else: self._status_diagnosis = UNKNOWN def _init_status_interval(self): """Fill in `self._status_interval`.""" if self._status_interval is not None: return self._init_status_task() self._status_interval = [] if self._status_task is None: return query_end = time_utils.epoch_time_to_date_string(self.end_time) interval = self._afe.get_host_diagnosis_interval( self._host.id, query_end, self._status_diagnosis != WORKING) if not interval: return self._status_interval = self._get_history( parse_time(interval[0]), parse_time(interval[1])) def diagnosis_interval(self): """Find this history's most recent diagnosis interval. Returns a list of `_JobEvent` instances corresponding to the most recent diagnosis interval occurring before this history's end time. The list is returned as with `self._history`, ordered from most to least recent. @return The list of the `_JobEvent`s in the diagnosis interval. """ self._init_status_interval() return self._status_interval def last_diagnosis(self): """Return the diagnosis of whether the DUT is working. This searches the DUT's job history, looking for the most recent status task for the DUT. Return a tuple of `(diagnosis, task)`. The `diagnosis` entry in the tuple is one of these values: * UNUSED - The host's last status task is older than `self.start_time`. * WORKING - The DUT is working. * BROKEN - The DUT likely requires manual intervention. * UNKNOWN - No task could be found indicating status for the DUT. If the DUT was working at last check, but hasn't been used inside this history's time interval, the status `UNUSED` is returned with the last status task, instead of `WORKING`. The `task` entry in the tuple is the status task that led to the diagnosis. The task will be `None` if the diagnosis is `UNKNOWN`. @return A tuple with the DUT's diagnosis and the task that determined it. """ self._init_status_task() diagnosis = self._status_diagnosis if (self.start_time is not None and self._status_task is not None and self._status_task.end_time < self.start_time and diagnosis == WORKING): diagnosis = UNUSED return diagnosis, self._status_task def get_diagnosis_interval(host_id, end_time, success): """Return the last diagnosis interval for a given host and time. This routine queries the database for the special tasks on a given host before a given time. From those tasks it selects the last status task before a change in status, and the first status task after the change. When `success` is true, the change must be from "working" to "broken". When false, the search is for a change in the opposite direction. A "successful status task" is any successful special task. A "failed status task" is a failed Repair task. These criteria are based on the definition of "status task" in the module-level docstring, above. This is the RPC endpoint for `AFE.get_host_diagnosis_interval()`. @param host_id Database host id of the desired host. @param end_time Find the last eligible interval before this time. @param success Whether the eligible interval should start with a success or a failure. @return A list containing the start time of the earliest job selected, and the end time of the latest job. """ base_query = afe_models.SpecialTask.objects.filter( host_id=host_id, is_complete=True) success_query = base_query.filter(success=True) failure_query = base_query.filter(success=False, task='Repair') if success: query0 = success_query query1 = failure_query else: query0 = failure_query query1 = success_query query0 = query0.filter(time_finished__lte=end_time) query0 = query0.order_by('time_started').reverse() if not query0: return [] task0 = query0[0] query1 = query1.filter(time_finished__gt=task0.time_finished) task1 = query1.order_by('time_started')[0] return [task0.time_started.strftime(time_utils.TIME_FMT), task1.time_finished.strftime(time_utils.TIME_FMT)] def get_status_task(host_id, end_time): """Get the last status task for a host before a given time. This routine returns a Django query for the AFE database to find the last task that finished on the given host before the given time that was either a successful task, or a Repair task. The query criteria are based on the definition of "status task" in the module-level docstring, above. This is the RPC endpoint for `_SpecialTaskEvent.get_status_task()`. @param host_id Database host id of the desired host. @param end_time End time of the range of interest. @return A Django query-set selecting the single special task of interest. """ # Selects status tasks: any Repair task, or any successful task. status_tasks = (django_models.Q(task='Repair') | django_models.Q(success=True)) # Our caller needs a Django query set in order to serialize the # result, so we don't resolve the query here; we just return a # slice with at most one element. return afe_models.SpecialTask.objects.filter( status_tasks, host_id=host_id, time_finished__lte=end_time, is_complete=True).order_by('time_started').reverse()[0:1] def _get_job_logdir(job): """Gets the logdir for an AFE job. @param job Job object which has id and owner properties. @return Relative path of the results log directory. """ return '%s-%s' % (job.id, job.owner) def get_job_gs_url(job): """Gets the GS URL for an AFE job. @param job Job object which has id and owner properties. @return Absolute GS URL to the results log directory. """ return _JobEvent.get_gs_url(_get_job_logdir(job))