// C++ source  |  178 lines  |  7.04 KB

// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// Common types and constants for extracting and evaluating features in the
// client-side phishing detection model.  A feature is simply a string and an
// associated floating-point value between 0 and 1.  The phishing
// classification model contains rules which give an appropriate weight to each
// feature or combination of features.  These values can then be summed to
// compute a final phishiness score.
//
// Some features are boolean features.  If these features are set, they always
// have a value of 0.0 or 1.0.  In practice, the features are only set if the
// value is true (1.0).
//
// We also use token features.  These features have a unique name that is
// constructed from the URL or page contents that we are classifying, for
// example, "UrlDomain=chromium".  These features are also always set to 1.0
// if they are present.
//
// The intermediate storage of the features for a URL is a FeatureMap, which is
// just a thin wrapper around a map of feature name to value.  The entire set
// of features for a URL is extracted before we do any scoring.

#ifndef CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_
#define CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_

#include <string>
#include "base/basictypes.h"
#include "base/containers/hash_tables.h"

namespace safe_browsing {

// FeatureMap holds the features extracted for a URL as a mapping from feature
// name to value.  It also enforces constraints on that mapping, such as an
// upper limit on the number of features it may contain.
class FeatureMap {
 public:
  // Upper bound on the number of features that will be extracted.  This cap
  // should never actually be reached; it exists as a sanity check that keeps
  // the FeatureMap from growing too large.
  static const size_t kMaxFeatureMapSize;

  FeatureMap();
  ~FeatureMap();

  // Read-only view of the features that are currently in the map.
  const base::hash_map<std::string, double>& features() const {
    return features_;
  }

  // Adds |name| as a boolean feature, i.e. with a value of 1.0.  Returns
  // true on success, or false if the feature map exceeds kMaxFeatureMapSize.
  bool AddBooleanFeature(const std::string& name);

  // Adds |name| as a real-valued feature with the given |value|, which must
  // lie in the range [0.0, 1.0].  Returns true on success, or false if the
  // feature map exceeds kMaxFeatureMapSize or |value| is outside of the
  // allowed range.
  bool AddRealFeature(const std::string& name, double value);

  // Removes all features from the map.
  void Clear();

 private:
  // Maps each feature name to its value.
  base::hash_map<std::string, double> features_;

  DISALLOW_COPY_AND_ASSIGN(FeatureMap);
};

namespace features {
// Constants for the various feature names that we use.  Each constant is
// only declared here (extern); its definition is not part of this header.
//
// IMPORTANT: when adding new features, you must update kAllowedFeatures in
// chrome/browser/safe_browsing/client_side_detection_service.cc if the feature
// should be sent in sanitized pingbacks.

////////////////////////////////////////////////////
// URL host features
////////////////////////////////////////////////////

// Set if the URL's hostname is an IP address.
extern const char kUrlHostIsIpAddress[];
// Token feature containing the portion of the hostname controlled by a
// registrar, for example "com" or "co.uk".
extern const char kUrlTldToken[];
// Token feature containing the first host component below the registrar.
// For example, in "www.google.com", the domain would be "google".
extern const char kUrlDomainToken[];
// Token feature containing each host component below the domain.
// For example, in "www.host.example.com", both "www" and "host" would be
// "other host tokens".
extern const char kUrlOtherHostToken[];

////////////////////////////////////////////////////
// Aggregate features for URL host tokens
////////////////////////////////////////////////////

// Set if the number of "other" host tokens for a URL is greater than one.
// Longer hostnames, regardless of the specific tokens, can be a signal that
// the URL is phishy.
extern const char kUrlNumOtherHostTokensGTOne[];
// Set if the number of "other" host tokens for a URL is greater than three.
extern const char kUrlNumOtherHostTokensGTThree[];

////////////////////////////////////////////////////
// URL path token features
////////////////////////////////////////////////////

// Token feature containing each alphanumeric string in the path that is at
// least 3 characters long.  For example, "/abc/d/efg" would have 2 path
// token features, "abc" and "efg".  Query parameters are not included.
extern const char kUrlPathToken[];

////////////////////////////////////////////////////
// DOM HTML form features
////////////////////////////////////////////////////

// Set if the page has any <form> elements.
extern const char kPageHasForms[];
// The fraction of form elements whose |action| attribute points to a
// URL on a different domain from the document URL.
extern const char kPageActionOtherDomainFreq[];

// Set if the page has any <input type="text"> elements
// (includes inputs with missing or unknown types).
extern const char kPageHasTextInputs[];
// Set if the page has any <input type="password"> elements.
extern const char kPageHasPswdInputs[];
// Set if the page has any <input type="radio"> elements.
extern const char kPageHasRadioInputs[];
// Set if the page has any <input type="checkbox"> elements.
extern const char kPageHasCheckInputs[];

////////////////////////////////////////////////////
// DOM HTML link features
////////////////////////////////////////////////////

// The fraction of links in the page which point to a domain other than the
// domain of the document.  See "URL host features" above for a discussion
// of how the domain is computed.
extern const char kPageExternalLinksFreq[];
// Token feature containing each external domain that is linked to.
extern const char kPageLinkDomain[];
// Fraction of links in the page that use https.
extern const char kPageSecureLinksFreq[];

////////////////////////////////////////////////////
// DOM HTML script features
////////////////////////////////////////////////////

// Set if the number of <script> elements in the page is greater than 1.
extern const char kPageNumScriptTagsGTOne[];
// Set if the number of <script> elements in the page is greater than 6.
extern const char kPageNumScriptTagsGTSix[];

////////////////////////////////////////////////////
// Other DOM HTML features
////////////////////////////////////////////////////

// The fraction of images whose src attribute points to an external domain.
extern const char kPageImgOtherDomainFreq[];

////////////////////////////////////////////////////
// Page term features
////////////////////////////////////////////////////

// Token feature for a term (whitespace-delimited) on a page.  Terms can be
// single words or multi-word n-grams.  Rather than adding this feature for
// every possible token on a page, only the terms that are mentioned in the
// classification model are added.
extern const char kPageTerm[];

}  // namespace features
}  // namespace safe_browsing

#endif  // CHROME_RENDERER_SAFE_BROWSING_FEATURES_H_