C++程序  |  427行  |  17 KB

// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_URL_MATCHER_URL_MATCHER_H_
#define COMPONENTS_URL_MATCHER_URL_MATCHER_H_

#include <set>
#include <vector>

#include "base/memory/ref_counted.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/scoped_vector.h"
#include "components/url_matcher/regex_set_matcher.h"
#include "components/url_matcher/substring_set_matcher.h"
#include "components/url_matcher/url_matcher_export.h"

class GURL;

namespace base {
class DictionaryValue;
}

namespace url_matcher {

// This class represents a single URL matching condition, e.g. a match on the
// host suffix or the containment of a string in the query component of a GURL.
//
// The difference from a simple StringPattern is that this also supports
// checking whether the {Host, Path, Query} of a URL contains a string. The
// reduction of URL matching conditions to StringPatterns conducted by
// URLMatcherConditionFactory is not capable of expressing that alone.
//
// Also supported is matching regular expressions against the URL (URL_MATCHES).
class URL_MATCHER_EXPORT URLMatcherCondition {
 public:
  enum Criterion {
    HOST_PREFIX,
    HOST_SUFFIX,
    HOST_CONTAINS,
    HOST_EQUALS,
    PATH_PREFIX,
    PATH_SUFFIX,
    PATH_CONTAINS,
    PATH_EQUALS,
    QUERY_PREFIX,
    QUERY_SUFFIX,
    QUERY_CONTAINS,
    QUERY_EQUALS,
    HOST_SUFFIX_PATH_PREFIX,
    HOST_EQUALS_PATH_PREFIX,
    URL_PREFIX,
    URL_SUFFIX,
    URL_CONTAINS,
    URL_EQUALS,
    URL_MATCHES,
    ORIGIN_AND_PATH_MATCHES,  // Matches the URL minus its query string.
  };

  URLMatcherCondition();
  ~URLMatcherCondition();
  URLMatcherCondition(Criterion criterion,
                      const StringPattern* substring_pattern);
  URLMatcherCondition(const URLMatcherCondition& rhs);
  URLMatcherCondition& operator=(const URLMatcherCondition& rhs);
  bool operator<(const URLMatcherCondition& rhs) const;

  Criterion criterion() const { return criterion_; }
  const StringPattern* string_pattern() const {
    return string_pattern_;
  }

  // Returns whether this URLMatcherCondition needs to be executed on a
  // full URL rather than the individual components (see
  // URLMatcherConditionFactory).
  bool IsFullURLCondition() const;

  // Returns whether this URLMatcherCondition is a regular expression to be
  // handled by a regex matcher instead of a substring matcher.
  bool IsRegexCondition() const;

  // Returns whether this URLMatcherCondition is a regular expression that shall
  // be evaluated on the URL without the query parameter.
  bool IsOriginAndPathRegexCondition() const;

  // Returns whether this condition is fulfilled according to
  // |matching_patterns| and |url|.
  bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
               const GURL& url) const;

 private:
  // |criterion_| and |string_pattern_| describe together what property a URL
  // needs to fulfill to be considered a match.
  Criterion criterion_;

  // This is the StringPattern that is used in a SubstringSetMatcher.
  const StringPattern* string_pattern_;
};

// Class to map the problem of finding {host, path, query} {prefixes, suffixes,
// containments, and equality} in GURLs to the substring matching problem.
//
// Say, you want to check whether the path of a URL starts with "/index.html".
// This class preprocesses a URL like "www.google.com/index.html" into something
// like "www.google.com|/index.html". After preprocessing, you can search for
// "|/index.html" in the string and see that this candidate URL actually has
// a path that starts with "/index.html". On the contrary,
// "www.google.com/images/index.html" would be normalized to
// "www.google.com|/images/index.html". It is easy to see that it contains
// "/index.html" but the path of the URL does not start with "/index.html".
//
// This preprocessing is important if you want to match a URL against many
// patterns because it reduces the matching to a "discover all substrings
// of a dictionary in a text" problem, which can be solved very efficiently
// by the Aho-Corasick algorithm.
//
// IMPORTANT: The URLMatcherConditionFactory owns the StringPattern
// referenced by created URLMatcherConditions. Therefore, it must outlive
// all created URLMatcherCondition and the SubstringSetMatcher.
class URL_MATCHER_EXPORT URLMatcherConditionFactory {
 public:
  URLMatcherConditionFactory();
  ~URLMatcherConditionFactory();

  // Canonicalizes a URL for "Create{Host,Path,Query}*Condition" searches.
  std::string CanonicalizeURLForComponentSearches(const GURL& url) const;

  // Factory methods for various condition types.
  //
  // Note that these methods fill the pattern_singletons_. If you create
  // conditions and don't register them to a URLMatcher, they will continue to
  // consume memory. You need to call ForgetUnusedPatterns() or
  // URLMatcher::ClearUnusedConditionSets() in this case.
  URLMatcherCondition CreateHostPrefixCondition(const std::string& prefix);
  URLMatcherCondition CreateHostSuffixCondition(const std::string& suffix);
  URLMatcherCondition CreateHostContainsCondition(const std::string& str);
  URLMatcherCondition CreateHostEqualsCondition(const std::string& str);

  URLMatcherCondition CreatePathPrefixCondition(const std::string& prefix);
  URLMatcherCondition CreatePathSuffixCondition(const std::string& suffix);
  URLMatcherCondition CreatePathContainsCondition(const std::string& str);
  URLMatcherCondition CreatePathEqualsCondition(const std::string& str);

  URLMatcherCondition CreateQueryPrefixCondition(const std::string& prefix);
  URLMatcherCondition CreateQuerySuffixCondition(const std::string& suffix);
  URLMatcherCondition CreateQueryContainsCondition(const std::string& str);
  URLMatcherCondition CreateQueryEqualsCondition(const std::string& str);

  // This covers the common case, where you don't care whether a domain
  // "foobar.com" is expressed as "foobar.com" or "www.foobar.com", and it
  // should be followed by a given |path_prefix|.
  URLMatcherCondition CreateHostSuffixPathPrefixCondition(
      const std::string& host_suffix,
      const std::string& path_prefix);
  URLMatcherCondition CreateHostEqualsPathPrefixCondition(
      const std::string& host,
      const std::string& path_prefix);

  // Canonicalizes a URL for "CreateURL*Condition" searches.
  std::string CanonicalizeURLForFullSearches(const GURL& url) const;

  // Canonicalizes a URL for "CreateURLMatchesCondition" searches.
  std::string CanonicalizeURLForRegexSearches(const GURL& url) const;
  // Canonicalizes a URL for "CreateOriginAndPathMatchesCondition" searches.
  std::string CanonicalizeURLForOriginAndPathRegexSearches(
      const GURL& url) const;

  URLMatcherCondition CreateURLPrefixCondition(const std::string& prefix);
  URLMatcherCondition CreateURLSuffixCondition(const std::string& suffix);
  URLMatcherCondition CreateURLContainsCondition(const std::string& str);
  URLMatcherCondition CreateURLEqualsCondition(const std::string& str);

  URLMatcherCondition CreateURLMatchesCondition(const std::string& regex);
  URLMatcherCondition CreateOriginAndPathMatchesCondition(
      const std::string& regex);

  // Removes all patterns from |pattern_singletons_| that are not listed in
  // |used_patterns|. These patterns are not referenced any more and get
  // freed.
  void ForgetUnusedPatterns(
      const std::set<StringPattern::ID>& used_patterns);

  // Returns true if this object retains no allocated data. Only for debugging.
  bool IsEmpty() const;

 private:
  // Creates a URLMatcherCondition according to the parameters passed.
  // The URLMatcherCondition will refer to a StringPattern that is
  // owned by |pattern_singletons_|.
  URLMatcherCondition CreateCondition(URLMatcherCondition::Criterion criterion,
                                      const std::string& pattern);

  // Prepends a "." to the hostname if it does not start with one.
  std::string CanonicalizeHostname(const std::string& hostname) const;

  // Convert the query string to canonical form suitable for key token search.
  std::string CanonicalizeQuery(std::string query,
                                bool prepend_beginning_of_query_component,
                                bool append_end_of_query_component) const;

  // Counter that ensures that all created StringPatterns have unique IDs.
  // Note that substring patterns and regex patterns will use different IDs.
  int id_counter_;

  // This comparison considers only the pattern() value of the
  // StringPatterns.
  struct StringPatternPointerCompare {
    bool operator()(StringPattern* lhs, StringPattern* rhs) const;
  };
  // Set to ensure that we generate only one StringPattern for each content
  // of StringPattern::pattern().
  typedef std::set<StringPattern*, StringPatternPointerCompare>
      PatternSingletons;
  PatternSingletons substring_pattern_singletons_;
  PatternSingletons regex_pattern_singletons_;
  PatternSingletons origin_and_path_regex_pattern_singletons_;

  DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionFactory);
};

// This class represents a single URL query matching condition. The query
// matching is done as a search for a key and optionally a value.
// The matching makes use of CanonicalizeURLForComponentSearches to ensure that
// the key starts and ends (optionally) with the right marker.
class URL_MATCHER_EXPORT URLQueryElementMatcherCondition {
 public:
  // Multiple occurrences of the same key can happen in a URL query. The type
  // ensures that every (MATCH_ALL), any (MATCH_ANY), first (MATCH_FIRST) or
  // last (MATCH_LAST) instance of the key occurrence matches the value.
  enum Type { MATCH_ANY, MATCH_FIRST, MATCH_LAST, MATCH_ALL };

  // Allows the match to be exact (QUERY_VALUE_MATCH_EXACT, starts and ends with
  // a delimiter or a border) or simply a prefix (QUERY_VALUE_MATCH_PREFIX,
  // starts with a delimiter or a border).
  enum QueryValueMatchType {
    QUERY_VALUE_MATCH_EXACT,
    QUERY_VALUE_MATCH_PREFIX
  };

  // Used to indicate if the query parameter is of type &key=value&
  // (ELEMENT_TYPE_KEY_VALUE) or simply &key& (ELEMENT_TYPE_KEY).
  enum QueryElementType { ELEMENT_TYPE_KEY_VALUE, ELEMENT_TYPE_KEY };

  URLQueryElementMatcherCondition(const std::string& key,
                                  const std::string& value,
                                  QueryValueMatchType query_value_match_type,
                                  QueryElementType query_element_type,
                                  Type match_type,
                                  URLMatcherConditionFactory* factory);
  ~URLQueryElementMatcherCondition();

  bool operator<(const URLQueryElementMatcherCondition& rhs) const;

  // Returns whether the URL query satisfies the key value constraint.
  bool IsMatch(const std::string& canonical_url_query) const;

  const StringPattern* string_pattern() const { return string_pattern_; }

 private:
  Type match_type_;
  std::string key_;
  std::string value_;
  size_t key_length_;
  size_t value_length_;
  const StringPattern* string_pattern_;
};

// This class represents a filter for the URL scheme to be hooked up into a
// URLMatcherConditionSet.
class URL_MATCHER_EXPORT URLMatcherSchemeFilter {
 public:
  explicit URLMatcherSchemeFilter(const std::string& filter);
  explicit URLMatcherSchemeFilter(const std::vector<std::string>& filters);
  ~URLMatcherSchemeFilter();
  bool IsMatch(const GURL& url) const;

 private:
  std::vector<std::string> filters_;

  DISALLOW_COPY_AND_ASSIGN(URLMatcherSchemeFilter);
};

// This class represents a filter for port numbers to be hooked up into a
// URLMatcherConditionSet.
class URL_MATCHER_EXPORT URLMatcherPortFilter {
 public:
  // Boundaries of a port range (both ends are included).
  typedef std::pair<int, int> Range;
  explicit URLMatcherPortFilter(const std::vector<Range>& ranges);
  ~URLMatcherPortFilter();
  bool IsMatch(const GURL& url) const;

  // Creates a port range [from, to]; both ends are included.
  static Range CreateRange(int from, int to);
  // Creates a port range containing a single port.
  static Range CreateRange(int port);

 private:
  std::vector<Range> ranges_;

  DISALLOW_COPY_AND_ASSIGN(URLMatcherPortFilter);
};

// This class represents a set of conditions that all need to match on a
// given URL in order to be considered a match.
class URL_MATCHER_EXPORT URLMatcherConditionSet
    : public base::RefCounted<URLMatcherConditionSet> {
 public:
  typedef int ID;
  typedef std::set<URLMatcherCondition> Conditions;
  typedef std::set<URLQueryElementMatcherCondition> QueryConditions;
  typedef std::vector<scoped_refptr<URLMatcherConditionSet> > Vector;

  // Matches if all conditions in |conditions| are fulfilled.
  URLMatcherConditionSet(ID id, const Conditions& conditions);

  // Matches if all conditions in |conditions|, |scheme_filter| and
  // |port_filter| are fulfilled. |scheme_filter| and |port_filter| may be NULL,
  // in which case, no restrictions are imposed on the scheme/port of a URL.
  URLMatcherConditionSet(ID id, const Conditions& conditions,
                         scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
                         scoped_ptr<URLMatcherPortFilter> port_filter);

  // Matches if all conditions in |conditions|, |query_conditions|,
  // |scheme_filter| and |port_filter| are fulfilled. |scheme_filter| and
  // |port_filter| may be NULL, in which case, no restrictions are imposed on
  // the scheme/port of a URL.
  URLMatcherConditionSet(ID id,
                         const Conditions& conditions,
                         const QueryConditions& query_conditions,
                         scoped_ptr<URLMatcherSchemeFilter> scheme_filter,
                         scoped_ptr<URLMatcherPortFilter> port_filter);

  ID id() const { return id_; }
  const Conditions& conditions() const { return conditions_; }
  const QueryConditions& query_conditions() const { return query_conditions_; }

  bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
               const GURL& url) const;

  bool IsMatch(const std::set<StringPattern::ID>& matching_patterns,
               const GURL& url,
               const std::string& url_for_component_searches) const;

 private:
  friend class base::RefCounted<URLMatcherConditionSet>;
  ~URLMatcherConditionSet();
  ID id_;
  Conditions conditions_;
  QueryConditions query_conditions_;
  scoped_ptr<URLMatcherSchemeFilter> scheme_filter_;
  scoped_ptr<URLMatcherPortFilter> port_filter_;

  DISALLOW_COPY_AND_ASSIGN(URLMatcherConditionSet);
};

// This class allows matching one URL against a large set of
// URLMatcherConditionSets at the same time.
class URL_MATCHER_EXPORT URLMatcher {
 public:
  URLMatcher();
  ~URLMatcher();

  // Adds new URLMatcherConditionSet to this URL Matcher. Each condition set
  // must have a unique ID.
  // This is an expensive operation as it triggers pre-calculations on the
  // currently registered condition sets. Do not call this operation many
  // times with a single condition set in each call.
  void AddConditionSets(const URLMatcherConditionSet::Vector& condition_sets);

  // Removes the listed condition sets. All |condition_set_ids| must be
  // currently registered. This function should be called with large batches
  // of |condition_set_ids| at a time to improve performance.
  void RemoveConditionSets(
      const std::vector<URLMatcherConditionSet::ID>& condition_set_ids);

  // Removes all unused condition sets from the ConditionFactory.
  void ClearUnusedConditionSets();

  // Returns the IDs of all URLMatcherConditionSet that match to this |url|.
  std::set<URLMatcherConditionSet::ID> MatchURL(const GURL& url) const;

  // Returns the URLMatcherConditionFactory that must be used to create
  // URLMatcherConditionSets for this URLMatcher.
  URLMatcherConditionFactory* condition_factory() {
    return &condition_factory_;
  }

  // Returns true if this object retains no allocated data. Only for debugging.
  bool IsEmpty() const;

 private:
  void UpdateSubstringSetMatcher(bool full_url_conditions);
  void UpdateRegexSetMatcher();
  void UpdateTriggers();
  void UpdateConditionFactory();
  void UpdateInternalDatastructures();

  URLMatcherConditionFactory condition_factory_;

  // Maps the ID of a URLMatcherConditionSet to the respective
  // URLMatcherConditionSet.
  typedef std::map<URLMatcherConditionSet::ID,
                   scoped_refptr<URLMatcherConditionSet> >
      URLMatcherConditionSets;
  URLMatcherConditionSets url_matcher_condition_sets_;

  // Maps a StringPattern ID to the URLMatcherConditions that need to
  // be triggered in case of a StringPattern match.
  typedef std::map<StringPattern::ID, std::set<URLMatcherConditionSet::ID> >
      StringPatternTriggers;
  StringPatternTriggers substring_match_triggers_;

  SubstringSetMatcher full_url_matcher_;
  SubstringSetMatcher url_component_matcher_;
  RegexSetMatcher regex_set_matcher_;
  RegexSetMatcher origin_and_path_regex_set_matcher_;
  std::set<const StringPattern*> registered_full_url_patterns_;
  std::set<const StringPattern*> registered_url_component_patterns_;

  DISALLOW_COPY_AND_ASSIGN(URLMatcher);
};

}  // namespace url_matcher

#endif  // COMPONENTS_URL_MATCHER_URL_MATCHER_H_