
#!/usr/bin/python

# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
This script crawls crbug. Sort-of.
Invocation:
    Get all bugs with labels, strings (in summary and/or comments):
        crbug_crawler.py --labels 'one two three'
                         --queries '"first query" "second query"'

    Get baddest open bugs of all time:
        crbug_crawler.py --reap

Tips:
    - Label based queries will return faster than text queries.
    - contrib/crbug_shell.py is a wrapper that allows you to incrementally
        filter search results using this script.
"""

import argparse
import cmd
import logging
import sys
import shlex

import common
from autotest_lib.client.common_lib import global_config
from autotest_lib.server.cros.dynamic_suite import reporting


def _parse_args(args):
    if not args:
        import crbug_crawler
        logging.error('Improper usage of crbug_crawler: %s',
                crbug_crawler.__doc__)
        sys.exit(1)

    description = ('Usage: crbug_crawler.py --reap')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--quiet', help=('Turn off logging noise.'),
            action='store_true', default=False)
    parser.add_argument('--num', help='Number of issues to output.', default=10,
            type=int)
    parser.add_argument('--queries',
                        help=('Search query. Eg: --queries "%s %s"' %
                              ('build_Root', 'login')),
                        default='')
    parser.add_argument('--labels',
                        help=('Search labels. Eg: --labels "%s %s"' %
                              ('autofiled', 'Pri-1')), default=None)
    parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
            action='store_true', default=False)
    return parser.parse_args(args)


class Update(object):
    """Class encapsulating fields of an update to a bug.
    """
    open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
                     'Started', 'ExternalDependency']
    closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']

    def __init__(self, comment='', labels='', status=''):
        self.comment = comment
        self.labels = labels if labels else []
        self.status = status


    def __str__(self):
        msg = 'status: %s' % self.status
        if self.labels:
            msg = '%s labels: %s' % (msg, self.labels)
        if self.comment:
            msg = '%s comment: %s' % (msg, self.comment)
        return msg
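
    # Illustrative usage (the comment, label and status values below are
    # placeholders, not taken from a real bug):
    #   update = Update(comment='Deduping against another issue.',
    #                   labels=['bogus-label'], status='Duplicate')
    #   str(update)  # -> 'status: Duplicate labels: [...] comment: ...'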


class UpdateManager(object):
    """Update manager that allows you to revert status updates.

    This class keeps track of the last update applied and is capable
    of reverting it.
    """

    def __init__(self, autocommit=False):
        """Initialize update manager.

        @param autocommit: If False just print out the update instead
            of committing it.
        """
        self.history = {}
        self.present = {}
        self.reporter = reporting.Reporter()
        self.phapi_lib = self.reporter.get_bug_tracker_client()
        self.autocommit = autocommit


    def revert(self):
        """Only manages status reverts as of now.
        """
        for issue_id, update in self.history.iteritems():
            logging.warning('You will have to manually update %s and %s on %s',
                    self.present[issue_id].labels,
                    self.present[issue_id].comment, issue_id)
            # Create a new update with just the status.
            self.update(issue_id, Update(status=update.status))


    def update(self, old_issue, update):
        """Record the state of an issue before updating it.

        @param old_issue: The issue to update. If an integer id is specified,
            the issue is fetched from the tracker. If an issue object (as
            defined in phapi_lib Issue) is passed in, it is used directly.
        @param update: The Update object to apply to the issue.
        """
        if type(old_issue) == int:
            old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
        old_update = Update(
                labels=old_issue.labels, status=old_issue.status)

        if not update.status:
            update.status = old_update.status
        elif (update.status not in Update.open_statuses and
              update.status not in Update.closed_statuses):
            raise ValueError('Unknown status %s' % update.status)

        if not self.autocommit:
            logging.warning('Would have applied the following update: '
                    '%s -> %s', old_update, update)
            return

        self.history[old_issue.id] = old_update
        self.reporter.modify_bug_report(
                issue_id=old_issue.id, comment=update.comment,
                label_update=update.labels,
                status=update.status)
        self.present[old_issue.id] = update
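
    # Illustrative dry run; the issue id 123456 is a placeholder. With
    # autocommit=False (the default) update() only logs the change it would
    # have made:
    #   manager = UpdateManager()
    #   manager.update(123456, Update(comment='Closing stale autofiled bug.',
    #                                 status='WontFix'))
    # With autocommit=True the change is committed, and manager.revert() can
    # later restore the previous status.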


class Crawler(object):
    """Class capable of crawling crbug.

    This class applies filters to issues it crawls and caches them locally.
    """

    # The limit at which we ask for confirmation to proceed with the crawl.
    PROMPT_LIMIT = 2000

    def __init__(self):
        self.reporter = reporting.Reporter()
        self.phapi_client = self.reporter.get_bug_tracker_client()
        self.issues = None
        self.all_autofiled_query = 'ANCHOR  TestFailure'
        self.all_autofiled_label = 'autofiled'
        self.prompted = False


    def fuzzy_search(self, query='', label='', fast=True):
        """Returns all issues using one query and/or one label.

        @param query: A string representing the query.
        @param label: A string representing the label.
        @param fast: If true, don't bother fetching comments.

        @return: A list of issues matching the query. If fast is
            specified the issues won't have comments.
        """
        if not query and not label:
            raise ValueError('Require a query or a label to make a tracker '
                    'query, try query = "%s" or one of the predefined labels '
                    '%s' % (self.all_autofiled_query,
                            self.reporter._PREDEFINED_LABELS))
        if type(label) != str:
            raise ValueError('The crawler only supports one label per query, '
                    'and it must be a string. You supplied %s' % label)
        return self.phapi_client.get_tracker_issues_by_text(
                query, label=label, full_text=not fast)
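
    # Illustrative usage (the label is just an example):
    #   crawler = Crawler()
    #   issues = crawler.fuzzy_search(label='autofiled', fast=True)
    # Label-only searches like this return faster than free-text queries.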


    @staticmethod
    def _get_autofiled_count(issue):
        """Return the autofiled count.

        @param issue: An issue object that has labels.

        @return: An integer representing the autofiled count.
        """
        for label in issue.labels:
            if 'autofiled-count-' in label:
                return int(label.replace('autofiled-count-', ''))

        # Force bugs without autofiled-count to sink
        return 0


    def _prompt_crawl(self, new_issues, start_index):
        """Warn the user that a crawl is getting large.

        This method prompts for a y/n answer in case the user wants to abort the
        crawl and specify another set of labels/queries.

        @param new_issues: A list of issues used with the start_index to
            determine the number of issues already processed.
        @param start_index: The start index of the next crawl iteration.
        """
        logging.warning('Found %s issues, crawling issues starting from %s',
                len(new_issues), start_index)
        if start_index > self.PROMPT_LIMIT and not self.prompted:
            logging.warning('Already crawled %s issues, it is possible that '
                    'you\'ve specified a very general label. If this is the '
                    'case consider re-ordering the labels so they start with '
                    'the rarest. Continue crawling [y/n]?',
                    start_index + len(new_issues))
            self.prompted = raw_input() == 'y'
            if not self.prompted:
                sys.exit(0)


    def exhaustive_crawl(self, query='', label='', fast=True):
        """Perform an exhaustive crawl using one label and query string.

        @param query: A string representing one query.
        @param label: A string representing one label.
        @param fast: If True, skip fetching comments for each issue.

        @return: A list of issues sorted by descending autofiled count.
        """
        start_index = 0
        self.phapi_client.set_max_results(200)
        logging.warning('Performing an exhaustive crawl with label %s query %s',
                label, query)
        vague_issues = []
        new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
        while new_issues:
            vague_issues += new_issues
            start_index += len(new_issues) + 1
            self.phapi_client.set_start_index(start_index)
            new_issues = self.fuzzy_search(query=query, label=label,
                    fast=fast)
            self._prompt_crawl(new_issues, start_index)

        # Subsequent calls will clear the issues cache with new results.
        self.phapi_client.set_start_index(1)
        return sorted(vague_issues, reverse=True,
                      key=lambda issue: self._get_autofiled_count(issue))
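
    # Illustrative usage: crawl every autofiled bug, most-duplicated first.
    #   crawler = Crawler()
    #   issues = crawler.exhaustive_crawl(label=crawler.all_autofiled_label)
    #   worst_offenders = issues[:10]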


    @staticmethod
    def filter_labels(issues, labels):
        """Takes a list of labels and returns matching issues.

        @param issues: A list of issues to parse for labels.
        @param labels: A list of labels to match.

        @return: A list of matching issues. The issues must contain
            all the labels specified.
        """
        if not labels:
            return issues
        matching_issues = set([])
        labels = set(labels)
        for issue in issues:
            issue_labels = set(issue.labels)
            if issue_labels.issuperset(labels):
                matching_issues.add(issue)
        return matching_issues


    @classmethod
    def does_query_match(cls, issue, query):
        """Check if a query matches the given issue.

        @param issue: The issue to check.
        @param query: The query to check against.

        @return: True if the query matches, False otherwise.
        """
        if query in issue.title or query in issue.summary:
            return True
        # We can only search comments if the issue is a complete issue,
        # i.e. as defined in phapi_lib.Issue.
        try:
            if any(query in comment for comment in issue.comments):
                return True
        except (AttributeError, TypeError):
            pass
        return False


    @classmethod
    def filter_queries(cls, issues, queries):
        """Take a list of queries and returns matching issues.

        @param issues: A list of issues to parse. If the issues contain
            comments and a query is not in the issue's title or summary,
            the comments are parsed for a substring match.
        @param queries: A list of queries to parse the issues for.
            This method looks for an exact substring match within each issue.

        @return: A list of matching issues.
        """
        if not queries:
            return issues
        matching_issues = set([])
        for issue in issues:
            # For each query, check if it's in the title, description or
            # comments. If a query isn't in any of these, discard the issue.
            for query in queries:
                if cls.does_query_match(issue, query):
                    matching_issues.add(issue)
                else:
                    if issue in matching_issues:
                        logging.warning('%s: %s\n \tPassed a subset of the '
                                'queries but failed query %s',
                                issue.id, issue.title, query)
                        matching_issues.remove(issue)
                    break
        return matching_issues
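
    # Illustrative usage (query strings are placeholders): keep only issues
    # that mention both strings in their title, summary or fetched comments.
    #   issues = Crawler.filter_queries(issues, ['login', 'timed out'])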


    def filter_issues(self, queries='', labels=None, fast=True):
        """Run the queries, labels filters by crawling crbug.

        @param queries: A space separated string of queries, usually passed
            through the command line.
        @param labels: A space separated string of labels, usually passed
            through the command line.
        @param fast: If specified, skip creating comments for issues since this
            can be a slow process. This value is only a suggestion, since it is
            ignored if multiple queries are specified.
        """
        queries = shlex.split(queries)
        labels = shlex.split(labels) if labels else None

        # We'll need comments to filter multiple queries.
        if len(queries) > 1:
            fast = False
        matching_issues = self.exhaustive_crawl(
                query=queries.pop(0) if queries else '',
                label=labels.pop(0) if labels else '', fast=fast)
        matching_issues = self.filter_labels(matching_issues, labels)
        matching_issues = self.filter_queries(matching_issues, queries)
        self.issues = list(matching_issues)
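
    # Illustrative end-to-end usage, mirroring main(); the query and label
    # values are placeholders:
    #   crawler = Crawler()
    #   crawler.filter_issues(queries='"login failed"', labels='autofiled',
    #                         fast=False)
    #   crawler.dump_issues(limit=10)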


    def dump_issues(self, limit=None):
        """Print issues.
        """
        if limit and limit < len(self.issues):
            issues = self.issues[:limit]
        else:
            issues = self.issues
        #TODO: Modify formatting, include some paging etc.
        for issue in issues:
            try:
                print ('[%s] %s crbug.com/%s %s' %
                       (self._get_autofiled_count(issue),
                        issue.status, issue.id, issue.title))
            except UnicodeEncodeError:
                print "Unicode error printing issue id %s" % issue.id
                continue


def _update_test(issues):
    """A simple update test, to record usage.

    @param issues: A list of issues (or issue ids) to apply a bogus update to.
    """
    updater = UpdateManager(autocommit=True)
    for issue in issues:
        updater.update(issue,
                       Update(comment='this is bogus', labels=['bogus'],
                              status='Assigned'))
    updater.revert()


def configure_logging(quiet=False):
    """Configure logging.

    @param quiet: True to turn off warning messages.
    """
    logging.basicConfig()
    logger = logging.getLogger()
    level = logging.WARNING
    if quiet:
        level = logging.ERROR
    logger.setLevel(level)


def main(args):
    crawler = Crawler()
    if args.reap:
        if args.queries or args.labels:
            logging.error('Query based ranking of bugs not supported yet.')
            return
        queries = ''
        labels = crawler.all_autofiled_label
    else:
        queries = args.queries
        labels = args.labels
    crawler.filter_issues(queries=queries, labels=labels,
            fast=False if queries else True)
    crawler.dump_issues(int(args.num))
    logging.warning('\nThis is a truncated list of %s results, use --num %s '
            'to get them all. If you want more informative results/better '
            'querying capabilities try crbug_shell.py.',
            args.num, len(crawler.issues))


if __name__ == '__main__':
    args = _parse_args(sys.argv[1:])
    configure_logging(args.quiet)
    main(args)