// Plain text  |  588 lines  |  18.6 KB

// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/browser/search_engines/template_url_parser.h"

#include <algorithm>
#include <limits>
#include <map>
#include <vector>

#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/string_number_conversions.h"
#include "base/string_util.h"
#include "base/utf_string_conversions.h"
#include "chrome/browser/search_engines/template_url.h"
#include "chrome/common/url_constants.h"
#include "googleurl/src/gurl.h"
#include "libxml/parser.h"
#include "libxml/xmlwriter.h"

namespace {

//
// NOTE: libxml uses the UTF-8 encoding. As code points 0-127 of UTF-8 are
// encoded exactly like plain char, the following names are all expressed in
// terms of char. This avoids having to convert to wide strings before doing
// comparisons.

// Element names of the OSD (OpenSearch description) document. These are
// compared against element names after any namespace prefix has been removed.
static const char kURLElement[] = "Url";
static const char kParamElement[] = "Param";
static const char kShortNameElement[] = "ShortName";
static const char kDescriptionElement[] = "Description";
static const char kImageElement[] = "Image";
static const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
// Alternative root element name; it is mapped to the same element type as
// kOpenSearchDescriptionElement.
static const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
static const char kLanguageElement[] = "Language";
static const char kInputEncodingElement[] = "InputEncoding";

// Various XML attributes used.
// Attributes read from Url elements:
static const char kURLTypeAttribute[] = "type";
static const char kURLTemplateAttribute[] = "template";
// Attributes read from Image elements:
static const char kImageTypeAttribute[] = "type";
static const char kImageWidthAttribute[] = "width";
static const char kImageHeightAttribute[] = "height";
// More attributes read from Url elements:
static const char kURLIndexOffsetAttribute[] = "indexOffset";
static const char kURLPageOffsetAttribute[] = "pageOffset";
// Attributes read from Param elements:
static const char kParamNameAttribute[] = "name";
static const char kParamValueAttribute[] = "value";
// Despite its name, this attribute is looked up on Url elements (see
// ParseURL()); a value of "post" (case-insensitive) marks the URL as POST.
static const char kParamMethodAttribute[] = "method";

// Mime type for search results.
static const char kHTMLType[] = "text/html";

// Mime type for as you type suggestions.
static const char kSuggestionType[] = "application/x-suggestions+json";

// Namespace identifier.
// NOTE(review): not referenced by any code visible in this file - confirm
// whether it is still needed.
static const char kOSDNS[] = "xmlns";

// The namespace for documents we understand.
// NOTE(review): not referenced by any code visible in this file - confirm
// whether it is still needed.
static const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/";

// Strips a namespace prefix from |name| in place: everything up to and
// including the first ':' is dropped (ex: os:Url -> Url). A name without a
// ':' is left untouched.
static void PruneNamespace(std::string* name) {
  const std::string::size_type colon = name->find(':');
  if (colon == std::string::npos)
    return;
  *name = name->substr(colon + 1);
}

//
// To minimize memory overhead while parsing, a SAX style parser is used.
// ParsingContext maintains the state of where we are in the document while
// parsing: the stack of currently open elements, the character data gathered
// for the current element, and the per-Url / per-Image data that is applied
// to the TemplateURL as elements are closed.
class ParsingContext {
 public:
  // Enum of the known element types.
  enum ElementType {
    UNKNOWN,
    OPEN_SEARCH_DESCRIPTION,
    URL,
    PARAM,
    SHORT_NAME,
    DESCRIPTION,
    IMAGE,
    LANGUAGE,
    INPUT_ENCODING,
  };

  // HTTP method declared for a Url element.
  enum Method {
    GET,
    POST
  };

  // Key/value of a Param node.
  typedef std::pair<std::string, std::string> Param;

  // |parameter_filter| may be NULL, in which case no parameter is filtered.
  // |url| is the TemplateURL being filled in; it is not owned by this object.
  // The first instance constructed also builds the shared element-name table.
  // NOTE(review): that lazy initialization is unsynchronized; presumably
  // parsing only ever happens on one thread at a time - confirm.
  ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter,
                 TemplateURL* url)
      : url_(url),
        parameter_filter_(parameter_filter),
        method_(GET),
        suggestion_method_(GET),
        is_suggest_url_(false),
        derive_image_from_url_(false) {
    if (kElementNameToElementTypeMap == NULL)
      InitMapping();
  }

  // Invoked when an element starts. |element| is the element name with any
  // namespace prefix already removed; names that are not in the lookup table
  // are recorded as UNKNOWN.
  void PushElement(const std::string& element) {
    // A single find() is enough; going through operator[] afterwards would
    // repeat the lookup.
    const ElementMap::const_iterator found =
        kElementNameToElementTypeMap->find(element);
    elements_.push_back(
        found == kElementNameToElementTypeMap->end() ? UNKNOWN : found->second);
  }

  // Invoked when an element ends. Does nothing if no element is open, since
  // pop_back() on an empty vector is undefined behavior.
  void PopElement() {
    if (!elements_.empty())
      elements_.pop_back();
  }

  // Returns the current ElementType. Only direct children of the root
  // description element, and Param elements directly under a Url element,
  // are reported; anything else (including the root itself) is UNKNOWN.
  ElementType GetKnownType() const {
    if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
      return elements_[1];

    // We only expect PARAM nodes under the Url node
    if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
        elements_[1] == URL && elements_[2] == PARAM)
      return PARAM;

    return UNKNOWN;
  }

  TemplateURL* template_url() { return url_; }

  // Starts a pending image with the given attributes. Non-positive dimensions
  // are ignored and leave any pending image untouched.
  void AddImageRef(const std::string& type, int width, int height) {
    if (width > 0 && height > 0)
      current_image_.reset(new TemplateURL::ImageRef(type, width, height));
  }

  // Discards the pending image, if any.
  void EndImage() {
    current_image_.reset();
  }

  // Completes the pending image with |url| and adds it to the TemplateURL.
  // Does nothing when there is no pending image.
  void SetImageURL(const GURL& url) {
    if (current_image_.get()) {
      current_image_->url = url;
      url_->add_image_ref(*current_image_);
      current_image_.reset();
    }
  }

  // Accessors for the character content of the current element.
  void ResetString() {
    string_.clear();
  }

  void AppendString(const string16& string) {
    string_ += string;
  }

  const string16& GetString() const {
    return string_;
  }

  // Forgets the Param nodes gathered so far.
  void ResetExtraParams() {
    extra_params_.clear();
  }

  // Records a Param node, unless the parameter filter (if any) rejects it.
  void AddExtraParams(const std::string& key, const std::string& value) {
    if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value))
      return;
    extra_params_.push_back(Param(key, value));
  }

  const std::vector<Param>& extra_params() const { return extra_params_; }

  void set_is_suggestion(bool value) { is_suggest_url_ = value; }
  bool is_suggestion() const { return is_suggest_url_; }

  TemplateURLParser::ParameterFilter* parameter_filter() const {
    return parameter_filter_;
  }

  void set_derive_image_from_url(bool derive_image_from_url) {
    derive_image_from_url_ = derive_image_from_url;
  }

  void set_method(Method method) { method_ = method; }
  Method method() const { return method_; }

  void set_suggestion_method(Method method) { suggestion_method_ = method; }
  Method suggestion_method() const { return suggestion_method_; }

  // Builds the image URL from the Template search URL if no image URL has been
  // set. Only acts when deriving was requested through
  // set_derive_image_from_url(true) and the TemplateURL has a search URL.
  void DeriveImageFromURL() {
    if (derive_image_from_url_ &&
        url_->GetFaviconURL().is_empty() && url_->url()) {
      GURL url(url_->url()->url());  // More url's please...
      url_->SetFaviconURL(TemplateURL::GenerateFaviconURL(url));
    }
  }

 private:
  typedef std::map<std::string, ElementType> ElementMap;

  // Creates and fills the element-name lookup table. The table is never
  // freed in this file.
  static void InitMapping() {
    kElementNameToElementTypeMap = new ElementMap;
    ElementMap& mapping = *kElementNameToElementTypeMap;
    mapping[kURLElement] = URL;
    mapping[kParamElement] = PARAM;
    mapping[kShortNameElement] = SHORT_NAME;
    mapping[kDescriptionElement] = DESCRIPTION;
    mapping[kImageElement] = IMAGE;
    mapping[kOpenSearchDescriptionElement] = OPEN_SEARCH_DESCRIPTION;
    mapping[kFirefoxSearchDescriptionElement] = OPEN_SEARCH_DESCRIPTION;
    mapping[kLanguageElement] = LANGUAGE;
    mapping[kInputEncodingElement] = INPUT_ENCODING;
  }

  // Key is UTF8 encoded.
  static std::map<std::string, ElementType>* kElementNameToElementTypeMap;
  // TemplateURL supplied to Read method. It's owned by the caller, so we
  // don't need to free it.
  TemplateURL* url_;
  // Stack of the elements that are currently open, outermost first.
  std::vector<ElementType> elements_;
  // Image whose attributes have been parsed but whose URL is still pending.
  scoped_ptr<TemplateURL::ImageRef> current_image_;

  // Character content for the current element.
  string16 string_;

  // Optional filter deciding which parameters are kept; may be NULL.
  TemplateURLParser::ParameterFilter* parameter_filter_;

  // The list of parameters parsed in the Param nodes of a Url node.
  std::vector<Param> extra_params_;

  // The HTTP methods used.
  Method method_;
  Method suggestion_method_;

  // If true, we are currently parsing a suggest URL, otherwise it is an HTML
  // search.  Note that we don't need a stack as Url nodes cannot be nested.
  bool is_suggest_url_;

  // Whether we should derive the image from the URL (when images are data
  // URLs).
  bool derive_image_from_url_;

  DISALLOW_COPY_AND_ASSIGN(ParsingContext);
};

// static
// Element-name lookup table shared by all ParsingContext instances. It starts
// out NULL; the ParsingContext constructor fills it in via InitMapping() the
// first time it finds it NULL.
std::map<std::string, ParsingContext::ElementType>*
    ParsingContext::kElementNameToElementTypeMap = NULL;

// Converts the |length| bytes of libxml character data starting at |value|
// into a UTF-16 string. The bytes are passed to UTF8ToUTF16() unchanged.
string16 XMLCharToUTF16(const xmlChar* value, int length) {
  const char* bytes = reinterpret_cast<const char*>(value);
  const std::string utf8(bytes, length);
  return UTF8ToUTF16(utf8);
}

// Copies the NUL-terminated libxml string |value| into a std::string without
// any conversion. |value| must not be NULL.
std::string XMLCharToString(const xmlChar* value) {
  const char* c_str = reinterpret_cast<const char*>(value);
  return std::string(c_str);
}

// Returns true if |input_encoding| looks like an encoding name: it must start
// with an ASCII letter, and every following character must be an ASCII
// letter, an ASCII digit, '.', '_' or '-'. This only checks the characters;
// it does not verify that the name denotes an encoding we actually know.
bool IsValidEncodingString(const std::string& input_encoding) {
  // The first character has to be a letter, which also rules out "".
  if (input_encoding.empty() || !IsAsciiAlpha(input_encoding[0]))
    return false;

  std::string::const_iterator it = input_encoding.begin() + 1;
  for (; it != input_encoding.end(); ++it) {
    const char c = *it;
    const bool allowed = IsAsciiAlpha(c) || IsAsciiDigit(c) ||
                         c == '.' || c == '_' || c == '-';
    if (!allowed)
      return false;
  }
  return true;
}

void ParseURL(const xmlChar** atts, ParsingContext* context) {
  if (!atts)
    return;

  TemplateURL* turl = context->template_url();
  const xmlChar** attributes = atts;
  std::string template_url;
  bool is_post = false;
  bool is_html_url = false;
  bool is_suggest_url = false;
  int index_offset = 1;
  int page_offset = 1;

  while (*attributes) {
    std::string name(XMLCharToString(*attributes));
    const xmlChar* value = attributes[1];
    if (name == kURLTypeAttribute) {
      std::string type = XMLCharToString(value);
      is_html_url = (type == kHTMLType);
      is_suggest_url = (type == kSuggestionType);
    } else if (name == kURLTemplateAttribute) {
      template_url = XMLCharToString(value);
    } else if (name == kURLIndexOffsetAttribute) {
      base::StringToInt(XMLCharToString(value), &index_offset);
      index_offset = std::max(1, index_offset);
    } else if (name == kURLPageOffsetAttribute) {
      base::StringToInt(XMLCharToString(value), &page_offset);
      page_offset = std::max(1, page_offset);
    } else if (name == kParamMethodAttribute) {
      is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
    }
    attributes += 2;
  }
  if (is_html_url) {
    turl->SetURL(template_url, index_offset, page_offset);
    context->set_is_suggestion(false);
    if (is_post)
      context->set_method(ParsingContext::POST);
  } else if (is_suggest_url) {
    turl->SetSuggestionsURL(template_url, index_offset, page_offset);
    context->set_is_suggestion(true);
    if (is_post)
      context->set_suggestion_method(ParsingContext::POST);
  }
}

void ParseImage(const xmlChar** atts, ParsingContext* context) {
  if (!atts)
    return;

  const xmlChar** attributes = atts;
  int width = 0;
  int height = 0;
  std::string type;
  while (*attributes) {
    std::string name(XMLCharToString(*attributes));
    const xmlChar* value = attributes[1];
    if (name == kImageTypeAttribute) {
      type = XMLCharToString(value);
    } else if (name == kImageWidthAttribute) {
      base::StringToInt(XMLCharToString(value), &width);
    } else if (name == kImageHeightAttribute) {
      base::StringToInt(XMLCharToString(value), &height);
    }
    attributes += 2;
  }
  if (width > 0 && height > 0 && !type.empty()) {
    // Valid Image URL.
    context->AddImageRef(type, width, height);
  }
}

void ParseParam(const xmlChar** atts, ParsingContext* context) {
  if (!atts)
    return;

  const xmlChar** attributes = atts;
  std::string key, value;
  while (*attributes) {
    std::string name(XMLCharToString(*attributes));
    const xmlChar* val = attributes[1];
    if (name == kParamNameAttribute) {
      key = XMLCharToString(val);
    } else if (name == kParamValueAttribute) {
      value = XMLCharToString(val);
    }
    attributes += 2;
  }
  if (!key.empty())
    context->AddExtraParams(key, value);
}

// Appends one parameter to the query string |*query|. A "&" separator is
// inserted first when |*query| already has content. "key=" is only emitted
// for a non-empty |key|; |value| is always appended as-is (no escaping).
static void AppendParamToQuery(const std::string& key,
                               const std::string& value,
                               std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += '&';
  if (!key.empty()) {
    out += key;
    out += '=';
  }
  out += value;
}

// Invoked when a Url element ends. Rewrites the query of the URL currently
// being parsed (the suggestions URL when the context says so, the search URL
// otherwise): parameters rejected by the parameter filter are removed and the
// parameters gathered from Param nodes are appended. The URL stored in the
// TemplateURL is only replaced when something actually changed.
void ProcessURLParams(ParsingContext* context) {
  TemplateURL* const t_url = context->template_url();
  const bool for_suggestions = context->is_suggestion();
  const TemplateURLRef* t_url_ref =
      for_suggestions ? t_url->suggestions_url() : t_url->url();
  if (!t_url_ref)
    return;

  TemplateURLParser::ParameterFilter* filter = context->parameter_filter();
  const std::vector<ParsingContext::Param>& params = context->extra_params();
  // Without a filter and without extra parameters there is nothing to do.
  if (!filter && params.empty())
    return;

  GURL url(t_url_ref->url());
  std::string new_query;
  bool modified = false;
  if (filter) {
    // Rebuild the query from the parameters the filter wants to keep.
    url_parse::Component query = url.parsed_for_possibly_invalid_spec().query;
    url_parse::Component key, value;
    const char* url_spec = url.spec().c_str();
    while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
      std::string key_str(url_spec, key.begin, key.len);
      std::string value_str(url_spec, value.begin, value.len);
      if (!filter->KeepParameter(key_str, value_str)) {
        modified = true;
        continue;
      }
      AppendParamToQuery(key_str, value_str, &new_query);
    }
  }
  // Nothing was filtered out: start from the original query verbatim.
  if (!modified)
    new_query = url.query();

  // Add the extra parameters if any.
  for (size_t i = 0; i < params.size(); ++i)
    AppendParamToQuery(params[i].first, params[i].second, &new_query);
  if (!params.empty())
    modified = true;

  if (!modified)
    return;

  GURL::Replacements repl;
  repl.SetQueryStr(new_query);
  url = url.ReplaceComponents(repl);
  if (for_suggestions) {
    t_url->SetSuggestionsURL(url.spec(),
                             t_url_ref->index_offset(),
                             t_url_ref->page_offset());
  } else {
    t_url->SetURL(url.spec(),
                  t_url_ref->index_offset(),
                  t_url_ref->page_offset());
  }
}

void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) {
  ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
  std::string node_name((const char*)name);
  PruneNamespace(&node_name);
  context->PushElement(node_name);
  switch (context->GetKnownType()) {
    case ParsingContext::URL:
      context->ResetExtraParams();
      ParseURL(atts, context);
      break;
    case ParsingContext::IMAGE:
      ParseImage(atts, context);
      break;
    case ParsingContext::PARAM:
      ParseParam(atts, context);
      break;
    default:
      break;
  }
  context->ResetString();
}

void EndElementImpl(void *ctx, const xmlChar *name) {
  ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
  switch (context->GetKnownType()) {
    case ParsingContext::SHORT_NAME:
      context->template_url()->set_short_name(context->GetString());
      break;
    case ParsingContext::DESCRIPTION:
      context->template_url()->set_description(context->GetString());
      break;
    case ParsingContext::IMAGE: {
      GURL image_url(UTF16ToUTF8(context->GetString()));
      if (image_url.SchemeIs(chrome::kDataScheme)) {
        // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
        // decode the data URL in the renderer. For now, we'll just point to the
        // favicon from the URL.
        context->set_derive_image_from_url(true);
      } else {
        context->SetImageURL(image_url);
      }
      context->EndImage();
      break;
    }
    case ParsingContext::LANGUAGE:
      context->template_url()->add_language(context->GetString());
      break;
    case ParsingContext::INPUT_ENCODING: {
      std::string input_encoding = UTF16ToASCII(context->GetString());
      if (IsValidEncodingString(input_encoding))
        context->template_url()->add_input_encoding(input_encoding);
      break;
    }
    case ParsingContext::URL:
      ProcessURLParams(context);
      break;
    default:
      break;
  }
  context->ResetString();
  context->PopElement();
}

void CharactersImpl(void *ctx, const xmlChar *ch, int len) {
  ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
  context->AppendString(XMLCharToUTF16(ch, len));
}

// Returns true if the ref is null, or the url wrapped by ref is
// valid with a spec of http/https.
bool IsHTTPRef(const TemplateURLRef* ref) {
  if (ref == NULL)
    return true;
  GURL url(ref->url());
  return (url.is_valid() && (url.SchemeIs(chrome::kHttpScheme) ||
                             url.SchemeIs(chrome::kHttpsScheme)));
}

// Returns true if the TemplateURL is legal. A legal TemplateURL is one whose
// search URL, suggestions URL and image URLs are all valid http/https URLs
// (a missing search or suggestions URL is accepted).
bool IsLegal(TemplateURL* url) {
  if (!IsHTTPRef(url->url()))
    return false;
  if (!IsHTTPRef(url->suggestions_url()))
    return false;

  // Make sure all the image refs are legal.
  typedef std::vector<TemplateURL::ImageRef> ImageRefs;
  const ImageRefs& image_refs = url->image_refs();
  for (ImageRefs::const_iterator it = image_refs.begin();
       it != image_refs.end(); ++it) {
    GURL image_url(it->url);
    if (!image_url.is_valid())
      return false;
    if (!image_url.SchemeIs(chrome::kHttpScheme) &&
        !image_url.SchemeIs(chrome::kHttpsScheme)) {
      return false;
    }
  }
  return true;
}

}  // namespace

// static
// Parses the OpenSearch description document held in the |length| bytes at
// |data| and fills in |url| (which must not be NULL). |param_filter| may be
// NULL; otherwise it decides which URL parameters are kept.
//
// Returns true only if the document yields a non-empty short name and
// description, the search URL does not use POST, and every URL found is a
// valid http/https URL. Input longer than INT_MAX bytes is rejected up front.
// Note that |url| may have been partially filled in even when false is
// returned.
bool TemplateURLParser::Parse(const unsigned char* data, size_t length,
                              TemplateURLParser::ParameterFilter* param_filter,
                              TemplateURL* url) {
  DCHECK(url);
  // libxml takes the buffer size as an int. Refuse input whose length does
  // not fit rather than handing libxml a truncated or negative size.
  if (length > static_cast<size_t>(std::numeric_limits<int>::max()))
    return false;

  // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
  // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
  // If this becomes problematic we'll need to provide our own entity
  // type for &amp;, or strip out &#38; by hand after parsing.
  int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
  ParsingContext context(param_filter, url);
  xmlSAXHandler sax_handler;
  memset(&sax_handler, 0, sizeof(sax_handler));
  sax_handler.startElement = &StartElementImpl;
  sax_handler.endElement = &EndElementImpl;
  sax_handler.characters = &CharactersImpl;
  // The parser's return value is intentionally not checked here; the result
  // is judged by what ended up in |url| below.
  xmlSAXUserParseMemory(&sax_handler, &context,
                        reinterpret_cast<const char*>(data),
                        static_cast<int>(length));
  // Restore the previous global entity substitution setting.
  xmlSubstituteEntitiesDefault(last_sub_entities_value);
  // If the image was a data URL, use the favicon from the search URL instead.
  // (see TODO in EndElementImpl()).
  context.DeriveImageFromURL();

  // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines
  //                that use POST yet.
  if (context.method() == ParsingContext::POST)
    return false;
  // A POST suggestions URL is merely dropped; the search URL is still usable.
  if (context.suggestion_method() == ParsingContext::POST)
    url->SetSuggestionsURL("", 0, 0);

  if (url->short_name().empty() || url->description().empty())
    return false;

  // So far so good, make sure the urls are http.
  return IsLegal(url);
}