# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import re
import shutil

from autotest_lib.client.common_lib import utils as client_utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.cros import constants
from autotest_lib.server.cros.dynamic_suite.constants import JOB_BUILD_KEY
from autotest_lib.server.crashcollect import collect_log_file
from autotest_lib.server import utils

try:
    from chromite.lib import metrics
except ImportError:
    metrics = client_utils.metrics_mock


def generate_minidump_stacktrace(minidump_path):
    """
    Generates a stacktrace for the specified minidump.

    This function expects the debug symbols to reside under:
        /build/<board>/usr/lib/debug

    The trace is written next to the dump, as "<minidump_path>.txt".

    @param minidump_path: absolute path to minidump to be symbolicated.
    @raise client_utils.error.CmdError if minidump_stackwalk return code != 0.
    """
    symbol_dir = '%s/../../../lib/debug' % utils.get_server_dir()
    logging.info('symbol_dir: %s', symbol_dir)
    client_utils.run('minidump_stackwalk "%s" "%s" > "%s.txt"' %
                     (minidump_path, symbol_dir, minidump_path))


def _resolve_crashserver():
    """
    Attempts to find a devserver / crashserver that has capacity to
    symbolicate a crashdump.

    A metrics counter is incremented for both the success and failure case.

    @raises DevServerException if no server with capacity could be found.
    @returns Hostname of resolved server, if found.
    """
    crashserver_name = dev_server.get_least_loaded_devserver(
            devserver_type=dev_server.CrashServer)
    if not crashserver_name:
        metrics.Counter('chromeos/autotest/crashcollect/could_not_resolve'
                        ).increment()
        raise dev_server.DevServerException(
                'No crash server has the capacity to symbolicate the dump.')

    metrics.Counter('chromeos/autotest/crashcollect/resolved'
                    ).increment(fields={'crash_server': crashserver_name})
    return crashserver_name


def _symbolicate_minidump_with_devserver(minidump_path, resultdir,
                                         crashserver_name):
    """
    Generates a stack trace for the specified minidump by consulting devserver.

    This function assumes the debug symbols have been staged on the devserver.
    On success the trace is written to "<minidump_path>.txt".

    @param minidump_path: absolute path to minidump to be symbolicated.
    @param resultdir: server job's result directory.
    @param crashserver_name: Name of crashserver to attempt to symbolicate with.
    @raise DevServerException upon failure, HTTP or otherwise.
    """
    # First, look up what build we tested.  If we can't find this, we can't
    # get the right debug symbols, so we might as well give up right now.
    keyvals = client_utils.read_keyval(resultdir)
    if JOB_BUILD_KEY not in keyvals:
        raise dev_server.DevServerException(
            'Cannot determine build being tested.')

    devserver = dev_server.CrashServer(crashserver_name)

    with metrics.SecondsTimer(
            'chromeos/autotest/crashcollect/symbolicate_duration',
            fields={'crash_server': crashserver_name}):
        trace_text = devserver.symbolicate_dump(minidump_path,
                                                keyvals[JOB_BUILD_KEY])

    if not trace_text:
        raise dev_server.DevServerException('Unknown error!!')
    with open(minidump_path + '.txt', 'w') as trace_file:
        trace_file.write(trace_text)


def generate_stacktrace_for_file(minidump, host_resultdir):
    """
    Tries to generate a stack trace for the file located at |minidump|.

    Local symbolication is attempted first; if that fails, a crash server is
    resolved and asked to symbolicate the dump.  Failures are logged, not
    raised.

    @param minidump: path to minidump file to generate the stacktrace for.
    @param host_resultdir: server job's result directory.
    """
    # First, try to symbolicate locally.
    try:
        logging.info('Trying to generate stack trace locally for %s', minidump)
        generate_minidump_stacktrace(minidump)
        logging.info('Generated stack trace for dump %s', minidump)
        return
    except client_utils.error.CmdError as err:
        logging.info('Failed to generate stack trace locally for '
                     'dump %s (rc=%d):\n%r',
                     minidump, err.result_obj.exit_status, err)

    # If that did not succeed, try to symbolicate using the dev server.
    try:
        logging.info('Generating stack trace using devserver for %s', minidump)
        crashserver_name = _resolve_crashserver()
        args = (minidump, host_resultdir, crashserver_name)
        is_timeout, _ = retry.timeout(_symbolicate_minidump_with_devserver,
                                      args=args,
                                      timeout_sec=600)
        if is_timeout:
            # A timeout falls through to the final "failed" warning below.
            logging.info('Generating stack trace timed out for dump %s',
                         minidump)
            metrics.Counter(
                    'chromeos/autotest/crashcollect/symbolicate_timed_out'
            ).increment(fields={'crash_server': crashserver_name})
        else:
            logging.info('Generated stack trace for dump %s', minidump)
            return
    except dev_server.DevServerException as e:
        logging.info('Failed to generate stack trace on devserver for dump '
                     '%s:\n%r', minidump, e)

    # Symbolicating failed.
    logging.warning('Failed to generate stack trace for %s (see info logs)',
                    minidump)


def find_and_generate_minidump_stacktraces(host_resultdir):
    """
    Finds all minidump files and generates a stack trace for each.

    Enumerates all files under the test results directory (recursively)
    and generates a stack trace file for the minidumps.  Minidump files are
    identified as files with .dmp extension.  The stack trace filename is
    composed by appending the .txt extension to the minidump filename.

    @param host_resultdir: Directory to walk looking for dmp files.

    @returns The list of all found minidump files. Each dump may or may not have
             been symbolized.
    """
    minidumps = []
    for minidump in _find_crashdumps(host_resultdir):
        generate_stacktrace_for_file(minidump, host_resultdir)
        minidumps.append(minidump)
    return minidumps


def _find_crashdumps(host_resultdir):
    """Find crashdumps.

    Walks host_resultdir recursively and yields the path of every file whose
    name ends with '.dmp'.

    @param host_resultdir The result directory for this host for this test run.
    """
    for dirpath, _, filenames in os.walk(host_resultdir):
        for filename in filenames:
            if filename.endswith('.dmp'):
                yield os.path.join(dirpath, filename)


def _find_orphaned_crashdumps(host):
    """Return file paths of crashdumps on host.

    @param host A host object of the device.
    """
    return host.list_files_glob(os.path.join(constants.CRASH_DIR, '*'))


def report_crashdumps(host):
    """Report on crashdumps for host.

    This is run when no tests failed.  We don't process crashdumps in this
    case because of devserver load, but they should still be reported.

    @param host A host object of the device we're to pull crashes from.
    """
    for crashfile in _find_orphaned_crashdumps(host):
        logging.warning('Host crashdump exists: %s', crashfile)
        host.job.record('INFO', None, None,
                        'Host crashdump exists: %s' % (crashfile,))

    host_resultdir = _get_host_resultdir(host)
    for crashfile in _find_crashdumps(host_resultdir):
        logging.warning('Local crashdump exists: %s', crashfile)
        host.job.record('INFO', None, None,
                        'Local crashdump exists: %s' % (crashfile,))


def fetch_orphaned_crashdumps(host, infodir):
    """
    Copy all of the crashes in the crash directory over to the results folder.

    Collection is best-effort: any exception raised while collecting is
    logged and the orphans gathered so far are returned.

    @param host A host object of the device we're to pull crashes from.
    @param infodir The directory to fetch crashdumps into.
    @return The list of minidumps that we pulled back from the host.
    """
    if not os.path.exists(infodir):
        os.mkdir(infodir)
    orphans = []

    if not host.check_cached_up_status():
        logging.warning('Host %s did not answer to ping, skip fetching '
                        'orphaned crashdumps.', host.hostname)
        return orphans

    try:
        for crashfile in _find_orphaned_crashdumps(host):
            logging.info('Collecting %s...', crashfile)
            collect_log_file(host, crashfile, infodir, clean=True)
            orphans.append(crashfile)
    except Exception as e:
        logging.warning('Collection of orphaned crash dumps failed %s', e)
    finally:
        # Delete infodir if we have no orphans
        if not orphans:
            logging.info('There are no orphaned crashes; deleting %s', infodir)
            try:
                os.rmdir(infodir)
            except OSError as e:
                # os.rmdir only removes empty directories.  If infodir holds
                # anything (e.g. it pre-existed with content), keep going
                # rather than aborting crash collection.
                logging.warning('Could not delete %s: %s', infodir, e)
    return orphans


def _copy_to_debug_dir(host_resultdir, filename):
    """
    Copies a file to the debug dir under host_resultdir.

    A failed copy is logged as a warning and otherwise ignored.

    @param host_resultdir The result directory for this host for this test run.
    @param filename The full path of the file to copy to the debug folder.
    """
    debugdir = os.path.join(host_resultdir, 'debug')
    src = filename
    dst = os.path.join(debugdir, os.path.basename(filename))

    try:
        shutil.copyfile(src, dst)
        logging.info('Copied %s to %s', src, dst)
    except IOError:
        logging.warning('Failed to copy %s to %s', src, dst)


def _get_host_resultdir(host):
    """Get resultdir for host.

    @param host A host object of the device we're to pull crashes from.
    @return host.job.resultdir, or None if either attribute is missing.
    """
    return getattr(getattr(host, 'job', None), 'resultdir', None)


def get_host_infodir(host):
    """Get infodir for host.

    @param host A host object of the device we're to pull crashes from.
    @return Path 'crashinfo.<hostname>' under the host's result directory.
    """
    host_resultdir = _get_host_resultdir(host)
    return os.path.join(host_resultdir, 'crashinfo.%s' % host.hostname)


def get_site_crashdumps(host, test_start_time):
    """
    Copy all of the crashdumps from a host to the results directory.

    @param host The host object from which to pull crashes
    @param test_start_time When the test we just ran started.
                           (Not used by this implementation.)
    @return A list of all the minidumps
    """
    host_resultdir = _get_host_resultdir(host)
    infodir = get_host_infodir(host)

    orphans = fetch_orphaned_crashdumps(host, infodir)
    minidumps = find_and_generate_minidump_stacktraces(host_resultdir)

    # Record all crashdumps in status.log of the job:
    # - If one server job runs several client jobs we will only record
    # crashdumps in the status.log of the high level server job.
    # - We will record these crashdumps whether or not we successfully
    # symbolicate them.
    # The parentheses matter: "and" binds tighter than "or", so without them
    # orphans alone would trigger host.job.record() even with no job.
    if host.job and (minidumps or orphans):
        host.job.record('INFO', None, None, 'Start crashcollection record')
        for minidump in minidumps:
            host.job.record('INFO', None, 'New Crash Dump', minidump)
        for orphan in orphans:
            host.job.record('INFO', None, 'Orphaned Crash Dump', orphan)
        host.job.record('INFO', None, None, 'End crashcollection record')

    orphans.extend(minidumps)

    for minidump in orphans:
        report_bug_from_crash(host, minidump)

    # We copy Chrome crash information to the debug dir to assist debugging.
    # Since orphans occurred on a previous run, they are most likely not
    # relevant to the current failure, so we don't copy them.
    for minidump in minidumps:
        minidump_no_ext = os.path.splitext(minidump)[0]
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.dmp.txt')
        _copy_to_debug_dir(host_resultdir, minidump_no_ext + '.log')

    return orphans


def find_package_of(host, exec_name):
    """
    Find the package that an executable came from.

    @param host A host object that has the executable.
    @param exec_name Name of or path to executable.
    @return The name of the package that installed the executable, or ''
            if zero or more than one candidate package was found.
    """
    # Run "portageq owners" on "host" to determine which package owns
    # "exec_name."  Portageq output consists of package names followed by
    # tab-prefixed path names.  For example, owners of "python:"
    #
    # sys-devel/gdb-7.7.1-r2
    #         /usr/share/gdb/python
    # chromeos-base/dev-install-0.0.1-r711
    #         /usr/bin/python
    # dev-lang/python-2.7.3-r7
    #         /etc/env.d/python
    #
    # This gets piped into "xargs stat" to annotate each line with
    # information about the path, so we later can consider only packages
    # with executable files.  After annotation the above looks like:
    #
    # stat: cannot stat '@@@ sys-devel/gdb-7.7.1-r2 @@@': ...
    # stat: cannot stat '/usr/share/gdb/python': ...
    # stat: cannot stat '@@@ chromeos-base/dev-install-0.0.1-r711 @@@': ...
    # 755 -rwxr-xr-x /usr/bin/python
    # stat: cannot stat '@@@ dev-lang/python-2.7.3-r7 @@@': ...
    # 755 drwxr-xr-x /etc/env.d/python
    #
    # Package names are surrounded by "@@@" to facilitate parsing.  Lines
    # starting with an octal number were successfully annotated, because
    # the path existed on "host."
    # The above is then parsed to find packages which contain executable files
    # (not directories), in this case "chromeos-base/dev-install-0.0.1-r711."
    #
    # TODO(milleral): portageq can show scary looking error messages
    # in the debug logs via stderr.  We only look at stdout, so those
    # get filtered, but it would be good to silence them.
    #
    # NOTE(review): exec_name is interpolated into the shell command
    # unquoted; confirm callers only pass trusted values.
    cmd = ('portageq owners / ' + exec_name +
            r'| sed -e "s/^[^\t].*/@@@ & @@@/" -e "s/^\t//"'
            r'| tr \\n \\0'
            ' | xargs -0 -r stat -L -c "%a %A %n" 2>&1')
    portageq = host.run(cmd, ignore_status=True)

    # Parse into a set of names of packages containing an executable file.
    packages = set()
    pkg = ''
    pkg_re = re.compile('@@@ (.*) @@@')
    path_re = re.compile('^([0-7]{3,}) (.)')
    for line in portageq.stdout.splitlines():
        match = pkg_re.search(line)
        if match:
            # A package header: remember it for the path lines that follow.
            pkg = match.group(1)
            continue
        match = path_re.match(line)
        if match:
            # Any of the user/group/other execute bits set.
            isexec = int(match.group(1), 8) & 0o111
            # First char of the "%A" mode string is '-' for regular files.
            isfile = match.group(2) == '-'
            if pkg and isexec and isfile:
                packages.add(pkg)

    # If exactly one package found it must be the one we want, return it.
    if len(packages) == 1:
        return packages.pop()

    # TODO(milleral): Decide if it really is an error if not exactly one
    # package is found.
    # It is highly questionable as to if this should be left in the
    # production version of this code or not.
    if not packages:
        logging.warning('find_package_of() found no packages for "%s"',
                        exec_name)
    else:
        logging.warning('find_package_of() found multiple packages for "%s": '
                        '%s', exec_name, ', '.join(sorted(packages)))
    return ''


def report_bug_from_crash(host, minidump_path):
    """
    Given a host to query and a minidump, file a bug about the crash.

    Currently this only logs which package the crash would be reported
    against; the package is looked up from the 'exec_name' entry of the
    '.meta' file that sits next to the minidump.

    @param host A host object that is where the dump came from
    @param minidump_path The path to the dump file that should be reported.
    """
    # TODO(milleral): Once this has actually been tested, remove the
    # try/except. In the meantime, let's make sure nothing dies because of
    # the fact that this code isn't very heavily tested.
    try:
        meta_path = os.path.splitext(minidump_path)[0] + '.meta'
        with open(meta_path, 'r') as f:
            for line in f:
                # Split on the first '=' only, so a value that itself
                # contains '=' is kept intact.
                key, sep, value = line.partition('=')
                if sep and key == 'exec_name':
                    package = find_package_of(host, value.strip())
                    if not package:
                        package = '<unknown package>'
                    logging.info('Would report crash on %s.', package)
                    break
    except Exception as e:
        logging.warning('Crash detection failed with: %s', e)