// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
#define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
#pragma once
#include <functional>
#include <string>
#include <vector>
class GURL;
// A pattern that can be used to match URLs. A URLPattern is a very restricted
// subset of URL syntax:
//
// <url-pattern> := <scheme>://<host><path> | '<all_urls>'
// <scheme> := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome'
// <host> := '*' | '*.' <anychar except '/' and '*'>+
// <path> := '/' <any chars>
//
// * Host is not used when the scheme is 'file'.
// * The path can have embedded '*' characters which act as glob wildcards.
// * '<all_urls>' is a special pattern that matches any URL that contains a
// valid scheme (as specified by valid_schemes_).
// * The '*' scheme pattern excludes file URLs.
//
// Examples of valid patterns:
// - http://*/*
// - http://*/foo*
// - https://*.google.com/foo*bar
// - file://monkey*
// - http://127.0.0.1/*
//
// Examples of invalid patterns:
// - http://* -- path not specified
// - http://*foo/bar -- * not allowed as substring of host component
// - http://foo.*.bar/baz -- * must be first component
// - http:/bar -- scheme separator not found
// - foo://* -- invalid scheme
// - chrome:// -- we don't support chrome internal URLs
//
// Design rationale:
// * We need to be able to tell users what 'sites' a given URLPattern will
// affect. For example: "This extension will interact with the site
// 'www.google.com'."
// * We'd like to be able to convert as many existing Greasemonkey @include
// patterns to URLPatterns as possible. Greasemonkey @include patterns are
// simple globs, so this won't be perfect.
// * Although we would like to support any scheme, it isn't clear what to tell
// users about URLPatterns that affect data or javascript URLs, so those are
// left out for now.
//
// From a 2008-ish crawl of userscripts.org, the following patterns were found
// in @include lines:
// - total lines : 24471
// - @include * : 919
// - @include http://[^\*]+?/ : 11128 (no star in host)
// - @include http://\*\.[^\*]+?/ : 2325 (host prefixed by *.)
// - @include http://\*[^\.][^\*]+?/: 1524 (host prefixed by *, no dot -- many
// appear to only need subdomain
// matching, not real prefix matching)
// - @include http://[^\*/]+\*/ : 320 (host suffixed by *)
// - @include contains .tld : 297 (host suffixed by .tld -- a special
// Greasemonkey domain component that
// tries to match all valid registry-
// controlled suffixes)
// - @include http://\*/ : 228 (host is * exactly, but there is
// more to the pattern)
//
// So, we can support at least half of current @include lines without supporting
// subdomain matching. We can pick up at least another 10% by supporting
// subdomain matching. It is probably possible to coerce more of the existing
// patterns to URLPattern, but the resulting pattern will be more restrictive
// than the original glob, which is probably better than nothing.
class URLPattern {
 public:
  // A collection of scheme bitmasks for use with valid_schemes. Individual
  // values may be OR-ed together to allow several schemes at once.
  enum SchemeMasks {
    SCHEME_NONE       = 0,
    SCHEME_HTTP       = 1 << 0,
    SCHEME_HTTPS      = 1 << 1,
    SCHEME_FILE       = 1 << 2,
    SCHEME_FTP        = 1 << 3,
    SCHEME_CHROMEUI   = 1 << 4,
    SCHEME_FILESYSTEM = 1 << 5,
    // SCHEME_ALL will match every scheme, including chrome://, chrome-
    // extension://, about:, etc. Because this has lots of security
    // implications, third-party extensions should never be able to get access
    // to URL patterns initialized this way. It should only be used for internal
    // Chrome code.
    SCHEME_ALL = -1,
  };

  // Options for URLPattern::Parse().
  enum ParseOption {
    PARSE_LENIENT,
    PARSE_STRICT
  };

  // Error codes returned from Parse().
  enum ParseResult {
    PARSE_SUCCESS = 0,
    PARSE_ERROR_MISSING_SCHEME_SEPARATOR,
    PARSE_ERROR_INVALID_SCHEME,
    PARSE_ERROR_WRONG_SCHEME_SEPARATOR,
    PARSE_ERROR_EMPTY_HOST,
    PARSE_ERROR_INVALID_HOST_WILDCARD,
    PARSE_ERROR_EMPTY_PATH,
    PARSE_ERROR_HAS_COLON,  // Only checked when strict checks are enabled.
    NUM_PARSE_RESULTS
  };

  // The <all_urls> string pattern.
  static const char kAllUrlsPattern[];

  // Construct an URLPattern with the given set of allowable schemes. See
  // valid_schemes_ for more info.
  explicit URLPattern(int valid_schemes);

  // Convenience to construct a URLPattern from a string. The string is expected
  // to be a valid pattern. If the string is not known ahead of time, use
  // Parse() instead, which returns success or failure.
  URLPattern(int valid_schemes, const std::string& pattern);

#if defined(_MSC_VER) && _MSC_VER >= 1600
  // Note: don't use this directly. This exists so URLPattern can be used
  // with STL containers. Starting with Visual Studio 2010, we can't have this
  // method private and use "friend class std::vector<URLPattern>;" as we used
  // to do.
  URLPattern();
#endif

  ~URLPattern();

  // Gets the bitmask of valid schemes.
  int valid_schemes() const { return valid_schemes_; }
  // Replaces the bitmask of valid schemes.
  void set_valid_schemes(int valid_schemes) { valid_schemes_ = valid_schemes; }

  // Gets the host the pattern matches. This can be an empty string if the
  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
  const std::string& host() const { return host_; }
  // Replaces the host the pattern matches.
  void set_host(const std::string& host) { host_ = host; }

  // Gets whether to match subdomains of host().
  bool match_subdomains() const { return match_subdomains_; }
  // Sets whether to match subdomains of host().
  void set_match_subdomains(bool val) { match_subdomains_ = val; }

  // Gets the path the pattern matches with the leading slash. This can have
  // embedded asterisks which are interpreted using glob rules.
  const std::string& path() const { return path_; }
  // Sets the path the pattern matches.
  // NOTE(review): defined out of line; presumably it also refreshes
  // path_escaped_ -- confirm against the implementation file.
  void SetPath(const std::string& path);

  // Returns true if this pattern matches all urls.
  bool match_all_urls() const { return match_all_urls_; }
  // Sets whether this pattern matches all urls.
  void set_match_all_urls(bool val) { match_all_urls_ = val; }

  // Initializes this instance by parsing the provided string. Returns
  // URLPattern::PARSE_SUCCESS on success, or an error code otherwise. On
  // failure, this instance will have some intermediate values and is in an
  // invalid state. Adding error checks to URLPattern::Parse() can cause
  // patterns in installed extensions to fail. If an installed extension
  // uses a pattern that was valid but fails a new error check, the
  // extension will fail to load when chrome is auto-updated. To avoid
  // this, new parse checks are enabled only when |strictness| is
  // PARSE_STRICT. PARSE_STRICT should be used when loading in developer
  // mode, or when an extension's patterns are controlled by chrome (such
  // as component extensions).
  ParseResult Parse(const std::string& pattern_str,
                    ParseOption strictness);

  // Sets the scheme for pattern matches. This can be a single '*' if the
  // pattern matches all valid schemes (as defined by the valid_schemes_
  // property). Returns false on failure (if the scheme is not valid).
  bool SetScheme(const std::string& scheme);

  // Note: You should use MatchesScheme() instead of this getter unless you
  // absolutely need the exact scheme. This is exposed for testing.
  const std::string& scheme() const { return scheme_; }

  // Returns true if the specified scheme can be used in this URL pattern, and
  // false otherwise. Uses valid_schemes_ to determine validity.
  bool IsValidScheme(const std::string& scheme) const;

  // Returns true if this instance matches the specified URL.
  bool MatchesUrl(const GURL& url) const;

  // Returns true if |test| matches our scheme.
  bool MatchesScheme(const std::string& test) const;

  // Returns true if |test| matches our host. One overload takes the host as a
  // string, the other takes a GURL.
  bool MatchesHost(const std::string& test) const;
  bool MatchesHost(const GURL& test) const;

  // Returns true if |test| matches our path.
  bool MatchesPath(const std::string& test) const;

  // Returns a string representing this instance.
  std::string GetAsString() const;

  // Determine whether there is a URL that would match this instance and another
  // instance. This method is symmetrical: Calling other.OverlapsWith(this)
  // would result in the same answer.
  bool OverlapsWith(const URLPattern& other) const;

  // Convert this URLPattern into an equivalent set of URLPatterns that don't
  // use a wildcard in the scheme component. If this URLPattern doesn't use a
  // wildcard scheme, then the returned set will contain one element that is
  // equivalent to this instance.
  std::vector<URLPattern> ConvertToExplicitSchemes() const;

  // "Less than" predicate that orders patterns by host only. Two patterns
  // that both match all urls never order before one another, whatever their
  // hosts are; in every other case the result is whether |a|'s host compares
  // lexicographically less than |b|'s host. Scheme, path and the subdomain
  // flag are not consulted.
  static bool EffectiveHostCompare(const URLPattern& a, const URLPattern& b) {
    if (a.match_all_urls_ && b.match_all_urls_)
      return false;
    return a.host_.compare(b.host_) < 0;
  }

  // Used for origin comparisons in a std::set. Forwards to
  // EffectiveHostCompare().
  class EffectiveHostCompareFunctor {
   public:
    bool operator()(const URLPattern& a, const URLPattern& b) const {
      return EffectiveHostCompare(a, b);
    }
  };

  // Get an error string for a ParseResult.
  static const char* GetParseResultString(URLPattern::ParseResult parse_result);

 private:
#if !(defined(_MSC_VER) && _MSC_VER >= 1600)
  friend class std::vector<URLPattern>;

  // Note: don't use this directly. This exists so URLPattern can be used
  // with STL containers.
  URLPattern();
#endif

  // A bitmask containing the schemes which are considered valid for this
  // pattern. Parse() uses this to decide whether a pattern contains a valid
  // scheme. MatchesScheme uses this to decide whether a wildcard scheme_
  // matches a given test scheme.
  int valid_schemes_;

  // True if this is a special-case "<all_urls>" pattern.
  bool match_all_urls_;

  // The scheme for the pattern.
  std::string scheme_;

  // The host without any leading "*" components.
  std::string host_;

  // Whether we should match subdomains of the host. This is true if the first
  // component of the pattern's host was "*".
  bool match_subdomains_;

  // The path to match. This is everything after the host of the URL, or
  // everything after the scheme in the case of file:// URLs.
  std::string path_;

  // The path with "?" and "\" characters escaped for use with the
  // MatchPattern() function.
  std::string path_escaped_;
};
// A sequence of URLPatterns stored in a std::vector.
typedef std::vector<URLPattern> URLPatternList;
#endif // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_