// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "chrome/common/extensions/url_pattern.h" #include "base/string_piece.h" #include "base/string_split.h" #include "base/string_util.h" #include "chrome/common/url_constants.h" #include "googleurl/src/gurl.h" #include "googleurl/src/url_util.h" const char URLPattern::kAllUrlsPattern[] = "<all_urls>"; namespace { // TODO(aa): Consider adding chrome-extension? What about more obscure ones // like data: and javascript: ? // Note: keep this array in sync with kValidSchemeMasks. const char* kValidSchemes[] = { chrome::kHttpScheme, chrome::kHttpsScheme, chrome::kFileScheme, chrome::kFtpScheme, chrome::kChromeUIScheme, chrome::kFileSystemScheme, }; const int kValidSchemeMasks[] = { URLPattern::SCHEME_HTTP, URLPattern::SCHEME_HTTPS, URLPattern::SCHEME_FILE, URLPattern::SCHEME_FTP, URLPattern::SCHEME_CHROMEUI, URLPattern::SCHEME_FILESYSTEM, }; COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks), must_keep_these_arrays_in_sync); const char* kParseSuccess = "Success."; const char* kParseErrorMissingSchemeSeparator = "Missing scheme separator."; const char* kParseErrorInvalidScheme = "Invalid scheme."; const char* kParseErrorWrongSchemeType = "Wrong scheme type."; const char* kParseErrorEmptyHost = "Host can not be empty."; const char* kParseErrorInvalidHostWildcard = "Invalid host wildcard."; const char* kParseErrorEmptyPath = "Empty path."; const char* kParseErrorHasColon = "Ports are not supported in URL patterns. ':' may not be used in a host."; // Message explaining each URLPattern::ParseResult. const char* kParseResultMessages[] = { kParseSuccess, kParseErrorMissingSchemeSeparator, kParseErrorInvalidScheme, kParseErrorWrongSchemeType, kParseErrorEmptyHost, kParseErrorInvalidHostWildcard, kParseErrorEmptyPath, kParseErrorHasColon }; COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages), must_add_message_for_each_parse_result); const char kPathSeparator[] = "/"; bool IsStandardScheme(const std::string& scheme) { // "*" gets the same treatment as a standard scheme. if (scheme == "*") return true; return url_util::IsStandard(scheme.c_str(), url_parse::Component(0, static_cast<int>(scheme.length()))); } } // namespace URLPattern::URLPattern() : valid_schemes_(SCHEME_NONE), match_all_urls_(false), match_subdomains_(false) {} URLPattern::URLPattern(int valid_schemes) : valid_schemes_(valid_schemes), match_all_urls_(false), match_subdomains_(false) {} URLPattern::URLPattern(int valid_schemes, const std::string& pattern) : valid_schemes_(valid_schemes), match_all_urls_(false), match_subdomains_(false) { // Strict error checking is used, because this constructor is only // appropriate when we know |pattern| is valid. if (PARSE_SUCCESS != Parse(pattern, PARSE_STRICT)) NOTREACHED() << "URLPattern is invalid: " << pattern; } URLPattern::~URLPattern() { } URLPattern::ParseResult URLPattern::Parse(const std::string& pattern, ParseOption strictness) { CHECK(strictness == PARSE_LENIENT || strictness == PARSE_STRICT); // Special case pattern to match every valid URL. if (pattern == kAllUrlsPattern) { match_all_urls_ = true; match_subdomains_ = true; scheme_ = "*"; host_.clear(); SetPath("/*"); return PARSE_SUCCESS; } // Parse out the scheme. size_t scheme_end_pos = pattern.find(chrome::kStandardSchemeSeparator); bool has_standard_scheme_separator = true; // Some urls also use ':' alone as the scheme separator. if (scheme_end_pos == std::string::npos) { scheme_end_pos = pattern.find(':'); has_standard_scheme_separator = false; } if (scheme_end_pos == std::string::npos) return PARSE_ERROR_MISSING_SCHEME_SEPARATOR; if (!SetScheme(pattern.substr(0, scheme_end_pos))) return PARSE_ERROR_INVALID_SCHEME; bool standard_scheme = IsStandardScheme(scheme_); if (standard_scheme != has_standard_scheme_separator) return PARSE_ERROR_WRONG_SCHEME_SEPARATOR; // Advance past the scheme separator. scheme_end_pos += (standard_scheme ? strlen(chrome::kStandardSchemeSeparator) : 1); if (scheme_end_pos >= pattern.size()) return PARSE_ERROR_EMPTY_HOST; // Parse out the host and path. size_t host_start_pos = scheme_end_pos; size_t path_start_pos = 0; // File URLs are special because they have no host. if (scheme_ == chrome::kFileScheme || !standard_scheme) { path_start_pos = host_start_pos; } else { size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos); // Host is required. if (host_start_pos == host_end_pos) return PARSE_ERROR_EMPTY_HOST; if (host_end_pos == std::string::npos) return PARSE_ERROR_EMPTY_PATH; host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos); // The first component can optionally be '*' to match all subdomains. std::vector<std::string> host_components; base::SplitString(host_, '.', &host_components); if (host_components[0] == "*") { match_subdomains_ = true; host_components.erase(host_components.begin(), host_components.begin() + 1); } host_ = JoinString(host_components, '.'); // No other '*' can occur in the host, though. This isn't necessary, but is // done as a convenience to developers who might otherwise be confused and // think '*' works as a glob in the host. if (host_.find('*') != std::string::npos) return PARSE_ERROR_INVALID_HOST_WILDCARD; path_start_pos = host_end_pos; } SetPath(pattern.substr(path_start_pos)); if (strictness == PARSE_STRICT && host_.find(':') != std::string::npos) return PARSE_ERROR_HAS_COLON; return PARSE_SUCCESS; } bool URLPattern::SetScheme(const std::string& scheme) { scheme_ = scheme; if (scheme_ == "*") { valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS); } else if (!IsValidScheme(scheme_)) { return false; } return true; } bool URLPattern::IsValidScheme(const std::string& scheme) const { if (valid_schemes_ == SCHEME_ALL) return true; for (size_t i = 0; i < arraysize(kValidSchemes); ++i) { if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i])) return true; } return false; } void URLPattern::SetPath(const std::string& path) { path_ = path; path_escaped_ = path_; ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\"); ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?"); } bool URLPattern::MatchesUrl(const GURL &test) const { if (!MatchesScheme(test.scheme())) return false; if (match_all_urls_) return true; if (!MatchesHost(test)) return false; if (!MatchesPath(test.PathForRequest())) return false; return true; } bool URLPattern::MatchesScheme(const std::string& test) const { if (!IsValidScheme(test)) return false; return scheme_ == "*" || test == scheme_; } bool URLPattern::MatchesHost(const std::string& host) const { std::string test(chrome::kHttpScheme); test += chrome::kStandardSchemeSeparator; test += host; test += "/"; return MatchesHost(GURL(test)); } bool URLPattern::MatchesHost(const GURL& test) const { // If the hosts are exactly equal, we have a match. if (test.host() == host_) return true; // If we're matching subdomains, and we have no host in the match pattern, // that means that we're matching all hosts, which means we have a match no // matter what the test host is. if (match_subdomains_ && host_.empty()) return true; // Otherwise, we can only match if our match pattern matches subdomains. if (!match_subdomains_) return false; // We don't do subdomain matching against IP addresses, so we can give up now // if the test host is an IP address. if (test.HostIsIPAddress()) return false; // Check if the test host is a subdomain of our host. if (test.host().length() <= (host_.length() + 1)) return false; if (test.host().compare(test.host().length() - host_.length(), host_.length(), host_) != 0) return false; return test.host()[test.host().length() - host_.length() - 1] == '.'; } bool URLPattern::MatchesPath(const std::string& test) const { if (!MatchPattern(test, path_escaped_)) return false; return true; } std::string URLPattern::GetAsString() const { if (match_all_urls_) return kAllUrlsPattern; bool standard_scheme = IsStandardScheme(scheme_); std::string spec = scheme_ + (standard_scheme ? chrome::kStandardSchemeSeparator : ":"); if (scheme_ != chrome::kFileScheme && standard_scheme) { if (match_subdomains_) { spec += "*"; if (!host_.empty()) spec += "."; } if (!host_.empty()) spec += host_; } if (!path_.empty()) spec += path_; return spec; } bool URLPattern::OverlapsWith(const URLPattern& other) const { if (!MatchesScheme(other.scheme_) && !other.MatchesScheme(scheme_)) return false; if (!MatchesHost(other.host()) && !other.MatchesHost(host_)) return false; // We currently only use OverlapsWith() for the patterns inside // ExtensionExtent. In those cases, we know that the path will have only a // single wildcard at the end. This makes figuring out overlap much easier. It // seems like there is probably a computer-sciency way to solve the general // case, but we don't need that yet. DCHECK(path_.find('*') == path_.size() - 1); DCHECK(other.path().find('*') == other.path().size() - 1); if (!MatchesPath(other.path().substr(0, other.path().size() - 1)) && !other.MatchesPath(path_.substr(0, path_.size() - 1))) return false; return true; } std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const { std::vector<URLPattern> result; if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) { result.push_back(*this); return result; } for (size_t i = 0; i < arraysize(kValidSchemes); ++i) { if (MatchesScheme(kValidSchemes[i])) { URLPattern temp = *this; temp.SetScheme(kValidSchemes[i]); temp.set_match_all_urls(false); result.push_back(temp); } } return result; } // static const char* URLPattern::GetParseResultString( URLPattern::ParseResult parse_result) { return kParseResultMessages[parse_result]; }