// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "net/base/net_util.h" #include <map> #include <vector> #include "base/i18n/time_formatting.h" #include "base/json/string_escape.h" #include "base/lazy_instance.h" #include "base/logging.h" #include "base/memory/singleton.h" #include "base/stl_util.h" #include "base/strings/string_tokenizer.h" #include "base/strings/string_util.h" #include "base/strings/utf_offset_string_conversions.h" #include "base/strings/utf_string_conversions.h" #include "base/time/time.h" #include "url/gurl.h" #include "third_party/icu/source/common/unicode/uidna.h" #include "third_party/icu/source/common/unicode/uniset.h" #include "third_party/icu/source/common/unicode/uscript.h" #include "third_party/icu/source/common/unicode/uset.h" #include "third_party/icu/source/i18n/unicode/datefmt.h" #include "third_party/icu/source/i18n/unicode/regex.h" #include "third_party/icu/source/i18n/unicode/ulocdata.h" using base::Time; namespace net { namespace { typedef std::vector<size_t> Offsets; // Does some simple normalization of scripts so we can allow certain scripts // to exist together. // TODO(brettw) bug 880223: we should allow some other languages to be // oombined such as Chinese and Latin. We will probably need a more // complicated system of language pairs to have more fine-grained control. UScriptCode NormalizeScript(UScriptCode code) { switch (code) { case USCRIPT_KATAKANA: case USCRIPT_HIRAGANA: case USCRIPT_KATAKANA_OR_HIRAGANA: case USCRIPT_HANGUL: // This one is arguable. 
return USCRIPT_HAN; default: return code; } } bool IsIDNComponentInSingleScript(const base::char16* str, int str_len) { UScriptCode first_script = USCRIPT_INVALID_CODE; bool is_first = true; int i = 0; while (i < str_len) { unsigned code_point; U16_NEXT(str, i, str_len, code_point); UErrorCode err = U_ZERO_ERROR; UScriptCode cur_script = uscript_getScript(code_point, &err); if (err != U_ZERO_ERROR) return false; // Report mixed on error. cur_script = NormalizeScript(cur_script); // TODO(brettw) We may have to check for USCRIPT_INHERENT as well. if (is_first && cur_script != USCRIPT_COMMON) { first_script = cur_script; is_first = false; } else { if (cur_script != USCRIPT_COMMON && cur_script != first_script) return false; } } return true; } // Check if the script of a language can be 'safely' mixed with // Latin letters in the ASCII range. bool IsCompatibleWithASCIILetters(const std::string& lang) { // For now, just list Chinese, Japanese and Korean (positive list). // An alternative is negative-listing (languages using Greek and // Cyrillic letters), but it can be more dangerous. 
return !lang.substr(0, 2).compare("zh") || !lang.substr(0, 2).compare("ja") || !lang.substr(0, 2).compare("ko"); } typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap; class LangToExemplarSet { public: static LangToExemplarSet* GetInstance() { return Singleton<LangToExemplarSet>::get(); } private: LangToExemplarSetMap map; LangToExemplarSet() { } ~LangToExemplarSet() { STLDeleteContainerPairSecondPointers(map.begin(), map.end()); } friend class Singleton<LangToExemplarSet>; friend struct DefaultSingletonTraits<LangToExemplarSet>; friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**); friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*); DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet); }; bool GetExemplarSetForLang(const std::string& lang, icu::UnicodeSet** lang_set) { const LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map; LangToExemplarSetMap::const_iterator pos = map.find(lang); if (pos != map.end()) { *lang_set = pos->second; return true; } return false; } void SetExemplarSetForLang(const std::string& lang, icu::UnicodeSet* lang_set) { LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map; map.insert(std::make_pair(lang, lang_set)); } static base::LazyInstance<base::Lock>::Leaky g_lang_set_lock = LAZY_INSTANCE_INITIALIZER; // Returns true if all the characters in component_characters are used by // the language |lang|. bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters, const std::string& lang) { CR_DEFINE_STATIC_LOCAL( const icu::UnicodeSet, kASCIILetters, ('a', 'z')); icu::UnicodeSet* lang_set = NULL; // We're called from both the UI thread and the history thread. 
{ base::AutoLock lock(g_lang_set_lock.Get()); if (!GetExemplarSetForLang(lang, &lang_set)) { UErrorCode status = U_ZERO_ERROR; ULocaleData* uld = ulocdata_open(lang.c_str(), &status); // TODO(jungshik) Turn this check on when the ICU data file is // rebuilt with the minimal subset of locale data for languages // to which Chrome is not localized but which we offer in the list // of languages selectable for Accept-Languages. With the rebuilt ICU // data, ulocdata_open never should fall back to the default locale. // (issue 2078) // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING); if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) { lang_set = reinterpret_cast<icu::UnicodeSet *>( ulocdata_getExemplarSet(uld, NULL, 0, ULOCDATA_ES_STANDARD, &status)); // If |lang| is compatible with ASCII Latin letters, add them. if (IsCompatibleWithASCIILetters(lang)) lang_set->addAll(kASCIILetters); } else { lang_set = new icu::UnicodeSet(1, 0); } lang_set->freeze(); SetExemplarSetForLang(lang, lang_set); ulocdata_close(uld); } } return !lang_set->isEmpty() && lang_set->containsAll(component_characters); } // Returns true if the given Unicode host component is safe to display to the // user. bool IsIDNComponentSafe(const base::char16* str, int str_len, const std::string& languages) { // Most common cases (non-IDN) do not reach here so that we don't // need a fast return path. // TODO(jungshik) : Check if there's any character inappropriate // (although allowed) for domain names. // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and // http://www.unicode.org/reports/tr39/data/xidmodifications.txt // For now, we borrow the list from Mozilla and tweaked it slightly. // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because // they're gonna be canonicalized to U+0020 and full stop before // reaching here.) 
// The original list is available at // http://kb.mozillazine.org/Network.IDN.blacklist_chars and // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703 UErrorCode status = U_ZERO_ERROR; #ifdef U_WCHAR_IS_UTF16 icu::UnicodeSet dangerous_characters(icu::UnicodeString( L"[[\\ \u00ad\u00bc\u00bd\u01c3\u0337\u0338" L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]" L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]" L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae" L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014" L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14" L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]" L"[\ufffa-\ufffd]]"), status); DCHECK(U_SUCCESS(status)); icu::RegexMatcher dangerous_patterns(icu::UnicodeString( // Lone katakana no, so, or n L"[^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]" // Repeating Japanese accent characters L"|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c]"), 0, status); #else icu::UnicodeSet dangerous_characters(icu::UnicodeString( "[[\\u0020\\u00ad\\u00bc\\u00bd\\u01c3\\u0337\\u0338" "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]" "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]" "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae" "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014" "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14" "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]" "[\\ufffa-\\ufffd]]", -1, US_INV), status); DCHECK(U_SUCCESS(status)); icu::RegexMatcher dangerous_patterns(icu::UnicodeString( // Lone katakana no, so, or n "[^\\p{Katakana}][\\u30ce\\u30f3\u30bd][^\\p{Katakana}]" // Repeating Japanese accent characters "|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c]"), 0, status); #endif DCHECK(U_SUCCESS(status)); icu::UnicodeSet component_characters; icu::UnicodeString component_string(str, str_len); 
component_characters.addAll(component_string); if (dangerous_characters.containsSome(component_characters)) return false; DCHECK(U_SUCCESS(status)); dangerous_patterns.reset(component_string); if (dangerous_patterns.find()) return false; // If the language list is empty, the result is completely determined // by whether a component is a single script or not. This will block // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are // allowed with |languages| (while it blocks Chinese + Latin letters with // an accent as should be the case), but we want to err on the safe side // when |languages| is empty. if (languages.empty()) return IsIDNComponentInSingleScript(str, str_len); // |common_characters| is made up of ASCII numbers, hyphen, plus and // underscore that are used across scripts and allowed in domain names. // (sync'd with characters allowed in url_canon_host with square // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc. icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"), status); DCHECK(U_SUCCESS(status)); // Subtract common characters because they're always allowed so that // we just have to check if a language-specific set contains // the remainder. component_characters.removeAll(common_characters); base::StringTokenizer t(languages, ","); while (t.GetNext()) { if (IsComponentCoveredByLang(component_characters, t.token())) return true; } return false; } // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46(). // // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with // the backward compatibility in mind. What it does: // // 1. Use the up-to-date Unicode data. // 2. Define a case folding/mapping with the up-to-date Unicode data as // in IDNA 2003. // 3. Use transitional mechanism for 4 deviation characters (sharp-s, // final sigma, ZWJ and ZWNJ) for now. // 4. 
Continue to allow symbols and punctuations. // 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules. // 6. Do not apply STD3 rules // 7. Do not allow unassigned code points. // // It also closely matches what IE 10 does except for the BiDi check ( // http://goo.gl/3XBhqw ). // See http://http://unicode.org/reports/tr46/ and references therein // for more details. struct UIDNAWrapper { UIDNAWrapper() { UErrorCode err = U_ZERO_ERROR; // TODO(jungshik): Change options as different parties (browsers, // registrars, search engines) converge toward a consensus. value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err); if (U_FAILURE(err)) value = NULL; } UIDNA* value; }; static base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER; // Converts one component of a host (between dots) to IDN if safe. The result // will be APPENDED to the given output string and will be the same as the input // if it is not IDN or the IDN is unsafe to display. Returns whether any // conversion was performed. bool IDNToUnicodeOneComponent(const base::char16* comp, size_t comp_len, const std::string& languages, base::string16* out) { DCHECK(out); if (comp_len == 0) return false; // Only transform if the input can be an IDN component. static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'}; if ((comp_len > arraysize(kIdnPrefix)) && !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(base::char16))) { UIDNA* uidna = g_uidna.Get().value; DCHECK(uidna != NULL); size_t original_length = out->length(); int output_length = 64; UIDNAInfo info = UIDNA_INFO_INITIALIZER; UErrorCode status; do { out->resize(original_length + output_length); status = U_ZERO_ERROR; // This returns the actual length required. If this is more than 64 // code units, |status| will be U_BUFFER_OVERFLOW_ERROR and we'll try // the conversion again, but with a sufficiently large buffer. 
output_length = uidna_labelToUnicode( uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length], output_length, &info, &status); } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0)); if (U_SUCCESS(status) && info.errors == 0) { // Converted successfully. Ensure that the converted component // can be safely displayed to the user. out->resize(original_length + output_length); if (IsIDNComponentSafe(out->data() + original_length, output_length, languages)) return true; } // Something went wrong. Revert to original string. out->resize(original_length); } // We get here with no IDN or on error, in which case we just append the // literal input. out->append(comp, comp_len); return false; } // TODO(brettw) bug 734373: check the scripts for each host component and // don't un-IDN-ize if there is more than one. Alternatively, only IDN for // scripts that the user has installed. For now, just put the entire // path through IDN. Maybe this feature can be implemented in ICU itself? // // We may want to skip this step in the case of file URLs to allow unicode // UNC hostnames regardless of encodings. base::string16 IDNToUnicodeWithAdjustments( const std::string& host, const std::string& languages, base::OffsetAdjuster::Adjustments* adjustments) { if (adjustments) adjustments->clear(); // Convert the ASCII input to a base::string16 for ICU. base::string16 input16; input16.reserve(host.length()); input16.insert(input16.end(), host.begin(), host.end()); // Do each component of the host separately, since we enforce script matching // on a per-component basis. base::string16 out16; { for (size_t component_start = 0, component_end; component_start < input16.length(); component_start = component_end + 1) { // Find the end of the component. component_end = input16.find('.', component_start); if (component_end == base::string16::npos) component_end = input16.length(); // For getting the last component. 
size_t component_length = component_end - component_start; size_t new_component_start = out16.length(); bool converted_idn = false; if (component_end > component_start) { // Add the substring that we just found. converted_idn = IDNToUnicodeOneComponent( input16.data() + component_start, component_length, languages, &out16); } size_t new_component_length = out16.length() - new_component_start; if (converted_idn && adjustments) { adjustments->push_back(base::OffsetAdjuster::Adjustment( component_start, component_length, new_component_length)); } // Need to add the dot we just found (if we found one). if (component_end < input16.length()) out16.push_back('.'); } } return out16; } // If |component| is valid, its begin is incremented by |delta|. void AdjustComponent(int delta, url::Component* component) { if (!component->is_valid()) return; DCHECK(delta >= 0 || component->begin >= -delta); component->begin += delta; } // Adjusts all the components of |parsed| by |delta|, except for the scheme. void AdjustAllComponentsButScheme(int delta, url::Parsed* parsed) { AdjustComponent(delta, &(parsed->username)); AdjustComponent(delta, &(parsed->password)); AdjustComponent(delta, &(parsed->host)); AdjustComponent(delta, &(parsed->port)); AdjustComponent(delta, &(parsed->path)); AdjustComponent(delta, &(parsed->query)); AdjustComponent(delta, &(parsed->ref)); } // Helper for FormatUrlWithOffsets(). base::string16 FormatViewSourceUrl( const GURL& url, const std::string& languages, FormatUrlTypes format_types, UnescapeRule::Type unescape_rules, url::Parsed* new_parsed, size_t* prefix_end, base::OffsetAdjuster::Adjustments* adjustments) { DCHECK(new_parsed); const char kViewSource[] = "view-source:"; const size_t kViewSourceLength = arraysize(kViewSource) - 1; // Format the underlying URL and record adjustments. 
const std::string& url_str(url.possibly_invalid_spec()); adjustments->clear(); base::string16 result(base::ASCIIToUTF16(kViewSource) + FormatUrlWithAdjustments(GURL(url_str.substr(kViewSourceLength)), languages, format_types, unescape_rules, new_parsed, prefix_end, adjustments)); // Revise |adjustments| by shifting to the offsets to prefix that the above // call to FormatUrl didn't get to see. for (base::OffsetAdjuster::Adjustments::iterator it = adjustments->begin(); it != adjustments->end(); ++it) it->original_offset += kViewSourceLength; // Adjust positions of the parsed components. if (new_parsed->scheme.is_nonempty()) { // Assume "view-source:real-scheme" as a scheme. new_parsed->scheme.len += kViewSourceLength; } else { new_parsed->scheme.begin = 0; new_parsed->scheme.len = kViewSourceLength - 1; } AdjustAllComponentsButScheme(kViewSourceLength, new_parsed); if (prefix_end) *prefix_end += kViewSourceLength; return result; } class AppendComponentTransform { public: AppendComponentTransform() {} virtual ~AppendComponentTransform() {} virtual base::string16 Execute( const std::string& component_text, base::OffsetAdjuster::Adjustments* adjustments) const = 0; // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an // accessible copy constructor in order to call AppendFormattedComponent() // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ). 
}; class HostComponentTransform : public AppendComponentTransform { public: explicit HostComponentTransform(const std::string& languages) : languages_(languages) { } private: virtual base::string16 Execute( const std::string& component_text, base::OffsetAdjuster::Adjustments* adjustments) const OVERRIDE { return IDNToUnicodeWithAdjustments(component_text, languages_, adjustments); } const std::string& languages_; }; class NonHostComponentTransform : public AppendComponentTransform { public: explicit NonHostComponentTransform(UnescapeRule::Type unescape_rules) : unescape_rules_(unescape_rules) { } private: virtual base::string16 Execute( const std::string& component_text, base::OffsetAdjuster::Adjustments* adjustments) const OVERRIDE { return (unescape_rules_ == UnescapeRule::NONE) ? base::UTF8ToUTF16WithAdjustments(component_text, adjustments) : UnescapeAndDecodeUTF8URLComponentWithAdjustments(component_text, unescape_rules_, adjustments); } const UnescapeRule::Type unescape_rules_; }; // Transforms the portion of |spec| covered by |original_component| according to // |transform|. Appends the result to |output|. If |output_component| is // non-NULL, its start and length are set to the transformed component's new // start and length. If |adjustments| is non-NULL, appends adjustments (if // any) that reflect the transformation the original component underwent to // become the transformed value appended to |output|. 
void AppendFormattedComponent(const std::string& spec,
                              const url::Component& original_component,
                              const AppendComponentTransform& transform,
                              base::string16* output,
                              url::Component* output_component,
                              base::OffsetAdjuster::Adjustments* adjustments) {
  DCHECK(output);
  if (original_component.is_nonempty()) {
    size_t original_component_begin =
        static_cast<size_t>(original_component.begin);
    size_t output_component_begin = output->length();
    std::string component_str(spec, original_component_begin,
                              static_cast<size_t>(original_component.len));

    // Transform |component_str| and modify |adjustments| appropriately.
    base::OffsetAdjuster::Adjustments component_transform_adjustments;
    output->append(
        transform.Execute(component_str, &component_transform_adjustments));

    // Shift all the adjustments made for this component so the offsets are
    // valid for the original string and add them to |adjustments|.
    for (base::OffsetAdjuster::Adjustments::iterator comp_iter =
             component_transform_adjustments.begin();
         comp_iter != component_transform_adjustments.end(); ++comp_iter)
      comp_iter->original_offset += original_component_begin;
    if (adjustments) {
      adjustments->insert(adjustments->end(),
                          component_transform_adjustments.begin(),
                          component_transform_adjustments.end());
    }

    // Set positions of the parsed component.
    if (output_component) {
      output_component->begin = static_cast<int>(output_component_begin);
      output_component->len =
          static_cast<int>(output->length() - output_component_begin);
    }
  } else if (output_component) {
    // Nothing was appended for an empty component; mark the output component
    // accordingly.
    output_component->reset();
  }
}

}  // namespace

// Bit flags selecting which parts of a URL the formatting functions below
// leave out. kFormatUrlOmitAll is the union of the three individual flags.
const FormatUrlType kFormatUrlOmitNothing                     = 0;
const FormatUrlType kFormatUrlOmitUsernamePassword            = 1 << 0;
const FormatUrlType kFormatUrlOmitHTTP                        = 1 << 1;
const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2;
const FormatUrlType kFormatUrlOmitAll = kFormatUrlOmitUsernamePassword |
    kFormatUrlOmitHTTP | kFormatUrlOmitTrailingSlashOnBareHostname;

// Convenience wrapper around IDNToUnicodeWithAdjustments() for callers that
// do not need offset adjustments.
base::string16 IDNToUnicode(const std::string& host,
                            const std::string& languages) {
  return IDNToUnicodeWithAdjustments(host, languages, NULL);
}

// Builds one "<script>addRow(...);</script>\n" line for a directory listing.
// The addRow() arguments are, in order: the JSON-escaped |name|; the
// JSON-escaped, path-escaped link target (taken from |raw_bytes| when it is
// non-empty, otherwise from the UTF-8 form of |name|); 1 or 0 for |is_dir|;
// the size string (empty when |size| is negative); and the modification time
// string (empty when |modified| is null).
std::string GetDirectoryListingEntry(const base::string16& name,
                                     const std::string& raw_bytes,
                                     bool is_dir,
                                     int64 size,
                                     Time modified) {
  std::string result;
  result.append("<script>addRow(");
  base::EscapeJSONString(name, true, &result);
  result.append(",");
  if (raw_bytes.empty()) {
    base::EscapeJSONString(EscapePath(base::UTF16ToUTF8(name)), true, &result);
  } else {
    base::EscapeJSONString(EscapePath(raw_bytes), true, &result);
  }
  if (is_dir) {
    result.append(",1,");
  } else {
    result.append(",0,");
  }

  // Negative size means unknown or not applicable (e.g. directory).
  base::string16 size_string;
  if (size >= 0)
    size_string = FormatBytesUnlocalized(size);
  base::EscapeJSONString(size_string, true, &result);

  result.append(",");

  base::string16 modified_str;
  // |modified| can be NULL in FTP listings.
  if (!modified.is_null()) {
    modified_str = base::TimeFormatShortDateAndTime(modified);
  }
  base::EscapeJSONString(modified_str, true, &result);

  result.append(");</script>\n");

  return result;
}

// Appends the host of |url|, transformed by HostComponentTransform (IDN to
// Unicode using |languages|), to |output|. No component position or
// adjustments are reported.
void AppendFormattedHost(const GURL& url,
                         const std::string& languages,
                         base::string16* output) {
  AppendFormattedComponent(url.possibly_invalid_spec(),
      url.parsed_for_possibly_invalid_spec().host,
      HostComponentTransform(languages),
      output, NULL, NULL);
}

// Formats |url| via FormatUrlWithAdjustments() and then applies the resulting
// adjustments to |offsets_for_adjustment|. When |offsets_for_adjustment| is
// non-NULL, each offset is additionally passed through base::LimitOffset with
// the length of the formatted string as the limit.
base::string16 FormatUrlWithOffsets(
    const GURL& url,
    const std::string& languages,
    FormatUrlTypes format_types,
    UnescapeRule::Type unescape_rules,
    url::Parsed* new_parsed,
    size_t* prefix_end,
    std::vector<size_t>* offsets_for_adjustment) {
  base::OffsetAdjuster::Adjustments adjustments;
  const base::string16& format_url_return_value =
      FormatUrlWithAdjustments(url, languages, format_types, unescape_rules,
                               new_parsed, prefix_end, &adjustments);
  base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
  if (offsets_for_adjustment) {
    std::for_each(
        offsets_for_adjustment->begin(),
        offsets_for_adjustment->end(),
        base::LimitOffset<std::string>(format_url_return_value.length()));
  }
  return format_url_return_value;
}

// Formats |url| for display and returns the result.
//   |languages|: forwarded to the host transform for IDN decoding.
//   |format_types|: kFormatUrl* flags selecting what to omit.
//   |unescape_rules|: applied to username, password, path and query; the ref
//                     is always converted with UnescapeRule::NONE.
//   |new_parsed|: if non-NULL, reset and then filled with the component
//                 positions in the returned string.
//   |prefix_end|: if non-NULL, receives the length of the output that
//                 precedes the host.
//   |adjustments|: must be non-NULL; cleared, then filled with the offset
//                  adjustments describing how the spec was changed.
// The order in which text is appended and adjustments are recorded below is
// significant; do not reorder.
base::string16 FormatUrlWithAdjustments(
    const GURL& url,
    const std::string& languages,
    FormatUrlTypes format_types,
    UnescapeRule::Type unescape_rules,
    url::Parsed* new_parsed,
    size_t* prefix_end,
    base::OffsetAdjuster::Adjustments* adjustments) {
  DCHECK(adjustments != NULL);
  adjustments->clear();
  // Write into a scratch Parsed when the caller does not want one, so the
  // code below can use |new_parsed| unconditionally.
  url::Parsed parsed_temp;
  if (!new_parsed)
    new_parsed = &parsed_temp;
  else
    *new_parsed = url::Parsed();

  // Special handling for view-source:.  Don't use content::kViewSourceScheme
  // because this library shouldn't depend on chrome.
  const char* const kViewSource = "view-source";
  // Reject "view-source:view-source:..." to avoid deep recursion.
  const char* const kViewSourceTwice = "view-source:view-source:";
  if (url.SchemeIs(kViewSource) &&
      !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {
    return FormatViewSourceUrl(url, languages, format_types,
                               unescape_rules, new_parsed, prefix_end,
                               adjustments);
  }

  // We handle both valid and invalid URLs (this will give us the spec
  // regardless of validity).
  const std::string& spec = url.possibly_invalid_spec();
  const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();

  // Scheme & separators.  These are ASCII.
  base::string16 url_string;
  url_string.insert(
      url_string.end(), spec.begin(),
      spec.begin() + parsed.CountCharactersBefore(url::Parsed::USERNAME, true));
  const char kHTTP[] = "http://";
  const char kFTP[] = "ftp.";
  // url_fixer::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com.  This
  // means that if we trim "http://" off a URL whose host starts with "ftp." and
  // the user inputs this into any field subject to fixup (which is basically
  // all input fields), the meaning would be changed.  (In fact, often the
  // formatted URL is directly pre-filled into an input field.)  For this reason
  // we avoid stripping "http://" in this case.
  bool omit_http = (format_types & kFormatUrlOmitHTTP) &&
      EqualsASCII(url_string, kHTTP) &&
      !StartsWithASCII(url.host(), kFTP, true);
  new_parsed->scheme = parsed.scheme;

  // Username & password.
  if ((format_types & kFormatUrlOmitUsernamePassword) != 0) {
    // Remove the username and password fields. We don't want to display those
    // to the user since they can be used for attacks,
    // e.g. "http://google.com:search@evil.ru/"
    new_parsed->username.reset();
    new_parsed->password.reset();
    // Update the adjustments based on removed username and/or password.
    if (parsed.username.is_nonempty() || parsed.password.is_nonempty()) {
      if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
        // The seeming off-by-two is to account for the ':' after the username
        // and '@' after the password.
        adjustments->push_back(base::OffsetAdjuster::Adjustment(
            static_cast<size_t>(parsed.username.begin),
            static_cast<size_t>(parsed.username.len + parsed.password.len + 2),
            0));
      } else {
        const url::Component* nonempty_component =
            parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
        // The seeming off-by-one is to account for the '@' after the
        // username/password.
        adjustments->push_back(base::OffsetAdjuster::Adjustment(
            static_cast<size_t>(nonempty_component->begin),
            static_cast<size_t>(nonempty_component->len + 1),
            0));
      }
    }
  } else {
    AppendFormattedComponent(spec, parsed.username,
                             NonHostComponentTransform(unescape_rules),
                             &url_string, &new_parsed->username, adjustments);
    if (parsed.password.is_valid())
      url_string.push_back(':');
    AppendFormattedComponent(spec, parsed.password,
                             NonHostComponentTransform(unescape_rules),
                             &url_string, &new_parsed->password, adjustments);
    if (parsed.username.is_valid() || parsed.password.is_valid())
      url_string.push_back('@');
  }
  // Everything appended so far counts as the prefix that precedes the host.
  if (prefix_end)
    *prefix_end = static_cast<size_t>(url_string.length());

  // Host.
  AppendFormattedComponent(spec, parsed.host, HostComponentTransform(languages),
                           &url_string, &new_parsed->host, adjustments);

  // Port.
  if (parsed.port.is_nonempty()) {
    url_string.push_back(':');
    new_parsed->port.begin = url_string.length();
    url_string.insert(url_string.end(),
                      spec.begin() + parsed.port.begin,
                      spec.begin() + parsed.port.end());
    new_parsed->port.len = url_string.length() - new_parsed->port.begin;
  } else {
    new_parsed->port.reset();
  }

  // Path & query.  Both get the same general unescape & convert treatment.
  if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) ||
      !CanStripTrailingSlash(url)) {
    AppendFormattedComponent(spec, parsed.path,
                             NonHostComponentTransform(unescape_rules),
                             &url_string, &new_parsed->path, adjustments);
  } else {
    // The path is dropped from the output; record its removal so offsets
    // into the original spec are still mapped correctly.
    if (parsed.path.len > 0) {
      adjustments->push_back(base::OffsetAdjuster::Adjustment(
          parsed.path.begin, parsed.path.len, 0));
    }
  }
  if (parsed.query.is_valid())
    url_string.push_back('?');
  AppendFormattedComponent(spec, parsed.query,
                           NonHostComponentTransform(unescape_rules),
                           &url_string, &new_parsed->query, adjustments);

  // Ref.  This is valid, unescaped UTF-8, so we can just convert.
  if (parsed.ref.is_valid())
    url_string.push_back('#');
  AppendFormattedComponent(spec, parsed.ref,
                           NonHostComponentTransform(UnescapeRule::NONE),
                           &url_string, &new_parsed->ref, adjustments);

  // If we need to strip out http do it after the fact.
  if (omit_http && StartsWith(url_string, base::ASCIIToUTF16(kHTTP), true)) {
    const size_t kHTTPSize = arraysize(kHTTP) - 1;
    url_string = url_string.substr(kHTTPSize);
    // Because offsets in the |adjustments| are already calculated with respect
    // to the string with the http:// prefix in it, those offsets remain correct
    // after stripping the prefix.  The only thing necessary is to add an
    // adjustment to reflect the stripped prefix.
    adjustments->insert(adjustments->begin(),
                        base::OffsetAdjuster::Adjustment(0, kHTTPSize, 0));

    if (prefix_end)
      *prefix_end -= kHTTPSize;

    // Adjust new_parsed.
    DCHECK(new_parsed->scheme.is_valid());
    int delta = -(new_parsed->scheme.len + 3);  // +3 for ://.
    new_parsed->scheme.reset();
    AdjustAllComponentsButScheme(delta, new_parsed);
  }

  return url_string;
}

// Single-offset convenience wrapper around FormatUrlWithOffsets(). When
// |offset_for_adjustment| is non-NULL, the offset it points to is adjusted in
// place; otherwise no offsets are tracked.
base::string16 FormatUrl(const GURL& url,
                         const std::string& languages,
                         FormatUrlTypes format_types,
                         UnescapeRule::Type unescape_rules,
                         url::Parsed* new_parsed,
                         size_t* prefix_end,
                         size_t* offset_for_adjustment) {
  Offsets offsets;
  if (offset_for_adjustment)
    offsets.push_back(*offset_for_adjustment);
  base::string16 result = FormatUrlWithOffsets(url, languages, format_types,
      unescape_rules, new_parsed, prefix_end, &offsets);
  // |offsets| holds exactly the one entry pushed above in this case.
  if (offset_for_adjustment)
    *offset_for_adjustment = offsets[0];
  return result;
}

}  // namespace net