普通文本  |  716行  |  17.1 KB

// Copyright 2009 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "util/util.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"

namespace re2 {

static const int Trace = false;

typedef set<string>::iterator SSIter;
typedef set<string>::const_iterator ConstSSIter;

static int alloc_id = 100000;  // Used for debugging.
// Initializes a Prefilter, allocating subs_ as necessary.
Prefilter::Prefilter(Op op) {
  op_ = op;
  subs_ = NULL;
  if (op_ == AND || op_ == OR)
    subs_ = new vector<Prefilter*>;

  alloc_id_ = alloc_id++;
  VLOG(10) << "alloc_id: " << alloc_id_;
}

// Destroys a Prefilter.
Prefilter::~Prefilter() {
  VLOG(10) << "Deleted: " << alloc_id_;
  if (subs_) {
    for (int i = 0; i < subs_->size(); i++)
      delete (*subs_)[i];
    delete subs_;
    subs_ = NULL;
  }
}

// Simplify if the node is an empty Or or And.
Prefilter* Prefilter::Simplify() {
  if (op_ != AND && op_ != OR) {
    return this;
  }

  // Nothing left in the AND/OR.
  if (subs_->size() == 0) {
    if (op_ == AND)
      op_ = ALL;  // AND of nothing is true
    else
      op_ = NONE;  // OR of nothing is false

    return this;
  }

  // Just one subnode: throw away wrapper.
  if (subs_->size() == 1) {
    Prefilter* a = (*subs_)[0];
    subs_->clear();
    delete this;
    return a->Simplify();
  }

  return this;
}

// Combines two Prefilters together to create an "op" (AND or OR).
// The passed Prefilters will be part of the returned Prefilter or deleted.
// Does lots of work to avoid creating unnecessarily complicated structures.
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
  // If a, b can be rewritten as op, do so.
  a = a->Simplify();
  b = b->Simplify();

  // Canonicalize: a->op <= b->op.
  if (a->op() > b->op()) {
    Prefilter* t = a;
    a = b;
    b = t;
  }

  // Trivial cases.
  //    ALL AND b = b
  //    NONE OR b = b
  //    ALL OR b   = ALL
  //    NONE AND b = NONE
  // Don't need to look at b, because of canonicalization above.
  // ALL and NONE are smallest opcodes.
  if (a->op() == ALL || a->op() == NONE) {
    if ((a->op() == ALL && op == AND) ||
        (a->op() == NONE && op == OR)) {
      delete a;
      return b;
    } else {
      delete b;
      return a;
    }
  }

  // If a and b match op, merge their contents.
  if (a->op() == op && b->op() == op) {
    for (int i = 0; i < b->subs()->size(); i++) {
      Prefilter* bb = (*b->subs())[i];
      a->subs()->push_back(bb);
    }
    b->subs()->clear();
    delete b;
    return a;
  }

  // If a already has the same op as the op that is under construction
  // add in b (similarly if b already has the same op, add in a).
  if (b->op() == op) {
    Prefilter* t = a;
    a = b;
    b = t;
  }
  if (a->op() == op) {
    a->subs()->push_back(b);
    return a;
  }

  // Otherwise just return the op.
  Prefilter* c = new Prefilter(op);
  c->subs()->push_back(a);
  c->subs()->push_back(b);
  return c;
}

Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
  return AndOr(AND, a, b);
}

Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
  return AndOr(OR, a, b);
}

static void SimplifyStringSet(set<string> *ss) {
  // Now make sure that the strings aren't redundant.  For example, if
  // we know "ab" is a required string, then it doesn't help at all to
  // know that "abc" is also a required string, so delete "abc". This
  // is because, when we are performing a string search to filter
  // regexps, matching ab will already allow this regexp to be a
  // candidate for match, so further matching abc is redundant.

  for (SSIter i = ss->begin(); i != ss->end(); ++i) {
    SSIter j = i;
    ++j;
    while (j != ss->end()) {
      // Increment j early so that we can erase the element it points to.
      SSIter old_j = j;
      ++j;
      if (old_j->find(*i) != string::npos)
        ss->erase(old_j);
    }
  }
}

Prefilter* Prefilter::OrStrings(set<string>* ss) {
  SimplifyStringSet(ss);
  Prefilter* or_prefilter = NULL;
  if (!ss->empty()) {
    or_prefilter = new Prefilter(NONE);
    for (SSIter i = ss->begin(); i != ss->end(); ++i)
      or_prefilter = Or(or_prefilter, FromString(*i));
  }
  return or_prefilter;
}

static Rune ToLowerRune(Rune r) {
  if (r < Runeself) {
    if ('A' <= r && r <= 'Z')
      r += 'a' - 'A';
    return r;
  }

  CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
  if (f == NULL || r < f->lo)
    return r;
  return ApplyFold(f, r);
}

static Rune ToLowerRuneLatin1(Rune r) {
  if ('A' <= r && r <= 'Z')
    r += 'a' - 'A';
  return r;
}

Prefilter* Prefilter::FromString(const string& str) {
  Prefilter* m = new Prefilter(Prefilter::ATOM);
  m->atom_ = str;
  return m;
}

// Information about a regexp used during computation of Prefilter.
// Can be thought of as information about the set of strings matching
// the given regular expression.
class Prefilter::Info {
 public:
  Info();
  ~Info();

  // More constructors.  They delete their Info* arguments.
  static Info* Alt(Info* a, Info* b);
  static Info* Concat(Info* a, Info* b);
  static Info* And(Info* a, Info* b);
  static Info* Star(Info* a);
  static Info* Plus(Info* a);
  static Info* Quest(Info* a);
  static Info* EmptyString();
  static Info* NoMatch();
  static Info* AnyChar();
  static Info* CClass(CharClass* cc, bool latin1);
  static Info* Literal(Rune r);
  static Info* LiteralLatin1(Rune r);
  static Info* AnyMatch();

  // Format Info as a string.
  string ToString();

  // Caller takes ownership of the Prefilter.
  Prefilter* TakeMatch();

  set<string>& exact() { return exact_; }

  bool is_exact() const { return is_exact_; }

  class Walker;

 private:
  set<string> exact_;

  // When is_exact_ is true, the strings that match
  // are placed in exact_. When it is no longer an exact
  // set of strings that match this RE, then is_exact_
  // is false and the match_ contains the required match
  // criteria.
  bool is_exact_;

  // Accumulated Prefilter query that any
  // match for this regexp is guaranteed to match.
  Prefilter* match_;
};


Prefilter::Info::Info()
  : is_exact_(false),
    match_(NULL) {
}

Prefilter::Info::~Info() {
  delete match_;
}

Prefilter* Prefilter::Info::TakeMatch() {
  if (is_exact_) {
    match_ = Prefilter::OrStrings(&exact_);
    is_exact_ = false;
  }
  Prefilter* m = match_;
  match_ = NULL;
  return m;
}

// Format a Info in string form.
string Prefilter::Info::ToString() {
  if (this == NULL) {
    // Sometimes when iterating on children of a node,
    // some children might have NULL Info. Adding
    // the check here for NULL to take care of cases where
    // the caller is not checking.
    return "";
  }

  if (is_exact_) {
    int n = 0;
    string s;
    for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
      if (n++ > 0)
        s += ",";
      s += *i;
    }
    return s;
  }

  if (match_)
    return match_->DebugString();

  return "";
}

// Add the strings from src to dst.
static void CopyIn(const set<string>& src, set<string>* dst) {
  for (ConstSSIter i = src.begin(); i != src.end(); ++i)
    dst->insert(*i);
}

// Add the cross-product of a and b to dst.
// (For each string i in a and j in b, add i+j.)
static void CrossProduct(const set<string>& a,
                         const set<string>& b,
                         set<string>* dst) {
  for (ConstSSIter i = a.begin(); i != a.end(); ++i)
    for (ConstSSIter j = b.begin(); j != b.end(); ++j)
      dst->insert(*i + *j);
}

// Concats a and b. Requires that both are exact sets.
// Forms an exact set that is a crossproduct of a and b.
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
  if (a == NULL)
    return b;
  DCHECK(a->is_exact_);
  DCHECK(b && b->is_exact_);
  Info *ab = new Info();

  CrossProduct(a->exact_, b->exact_, &ab->exact_);
  ab->is_exact_ = true;

  delete a;
  delete b;
  return ab;
}

// Constructs an inexact Info for ab given a and b.
// Used only when a or b is not exact or when the
// exact cross product is likely to be too big.
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
  if (a == NULL)
    return b;
  if (b == NULL)
    return a;

  Info *ab = new Info();

  ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
  ab->is_exact_ = false;
  delete a;
  delete b;
  return ab;
}

// Constructs Info for a|b given a and b.
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
  Info *ab = new Info();

  if (a->is_exact_ && b->is_exact_) {
    CopyIn(a->exact_, &ab->exact_);
    CopyIn(b->exact_, &ab->exact_);
    ab->is_exact_ = true;
  } else {
    // Either a or b has is_exact_ = false. If the other
    // one has is_exact_ = true, we move it to match_ and
    // then create a OR of a,b. The resulting Info has
    // is_exact_ = false.
    ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
    ab->is_exact_ = false;
  }

  delete a;
  delete b;
  return ab;
}

// Constructs Info for a? given a.
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
  Info *ab = new Info();

  ab->is_exact_ = false;
  ab->match_ = new Prefilter(ALL);
  delete a;
  return ab;
}

// Constructs Info for a* given a.
// Same as a? -- not much to do.
Prefilter::Info* Prefilter::Info::Star(Info *a) {
  return Quest(a);
}

// Constructs Info for a+ given a. If a was exact set, it isn't
// anymore.
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
  Info *ab = new Info();

  ab->match_ = a->TakeMatch();
  ab->is_exact_ = false;

  delete a;
  return ab;
}

static string RuneToString(Rune r) {
  char buf[UTFmax];
  int n = runetochar(buf, &r);
  return string(buf, n);
}

static string RuneToStringLatin1(Rune r) {
  char c = r & 0xff;
  return string(&c, 1);
}

// Constructs Info for literal rune.
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
  Info* info = new Info();
  info->exact_.insert(RuneToString(ToLowerRune(r)));
  info->is_exact_ = true;
  return info;
}

// Constructs Info for literal rune for Latin1 encoded string.
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
  Info* info = new Info();
  info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
  info->is_exact_ = true;
  return info;
}

// Constructs Info for dot (any character).
Prefilter::Info* Prefilter::Info::AnyChar() {
  Prefilter::Info* info = new Prefilter::Info();
  info->match_ = new Prefilter(ALL);
  return info;
}

// Constructs Prefilter::Info for no possible match.
Prefilter::Info* Prefilter::Info::NoMatch() {
  Prefilter::Info* info = new Prefilter::Info();
  info->match_ = new Prefilter(NONE);
  return info;
}

// Constructs Prefilter::Info for any possible match.
// This Prefilter::Info is valid for any regular expression,
// since it makes no assertions whatsoever about the
// strings being matched.
Prefilter::Info* Prefilter::Info::AnyMatch() {
  Prefilter::Info *info = new Prefilter::Info();
  info->match_ = new Prefilter(ALL);
  return info;
}

// Constructs Prefilter::Info for just the empty string.
Prefilter::Info* Prefilter::Info::EmptyString() {
  Prefilter::Info* info = new Prefilter::Info();
  info->is_exact_ = true;
  info->exact_.insert("");
  return info;
}

// Constructs Prefilter::Info for a character class.
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
                                         bool latin1) {
  if (Trace) {
    VLOG(0) << "CharClassInfo:";
    for (CCIter i = cc->begin(); i != cc->end(); ++i)
      VLOG(0) << "  " << i->lo << "-" << i->hi;
  }

  // If the class is too large, it's okay to overestimate.
  if (cc->size() > 10)
    return AnyChar();

  Prefilter::Info *a = new Prefilter::Info();
  for (CCIter i = cc->begin(); i != cc->end(); ++i)
    for (Rune r = i->lo; r <= i->hi; r++) {
      if (latin1) {
        a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
      } else {
        a->exact_.insert(RuneToString(ToLowerRune(r)));
      }
    }


  a->is_exact_ = true;

  if (Trace) {
    VLOG(0) << " = " << a->ToString();
  }

  return a;
}

class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
 public:
  Walker(bool latin1) : latin1_(latin1) {}

  virtual Info* PostVisit(
      Regexp* re, Info* parent_arg,
      Info* pre_arg,
      Info** child_args, int nchild_args);

  virtual Info* ShortVisit(
      Regexp* re,
      Info* parent_arg);

  bool latin1() { return latin1_; }
 private:
  bool latin1_;
  DISALLOW_EVIL_CONSTRUCTORS(Walker);
};

Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
  if (Trace) {
    LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
  }

  bool latin1 = re->parse_flags() & Regexp::Latin1;
  Prefilter::Info::Walker w(latin1);
  Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);

  if (w.stopped_early()) {
    delete info;
    return NULL;
  }

  return info;
}

Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
    Regexp* re, Prefilter::Info* parent_arg) {
  return AnyMatch();
}

// Constructs the Prefilter::Info for the given regular expression.
// Assumes re is simplified.
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
    Regexp* re, Prefilter::Info* parent_arg,
    Prefilter::Info* pre_arg, Prefilter::Info** child_args,
    int nchild_args) {
  Prefilter::Info *info;
  switch (re->op()) {
    default:
    case kRegexpRepeat:
      LOG(DFATAL) << "Bad regexp op " << re->op();
      info = EmptyString();
      break;

    case kRegexpNoMatch:
      info = NoMatch();
      break;

    // These ops match the empty string:
    case kRegexpEmptyMatch:      // anywhere
    case kRegexpBeginLine:       // at beginning of line
    case kRegexpEndLine:         // at end of line
    case kRegexpBeginText:       // at beginning of text
    case kRegexpEndText:         // at end of text
    case kRegexpWordBoundary:    // at word boundary
    case kRegexpNoWordBoundary:  // not at word boundary
      info = EmptyString();
      break;

    case kRegexpLiteral:
      if (latin1()) {
        info = LiteralLatin1(re->rune());
      }
      else {
        info = Literal(re->rune());
      }
      break;

    case kRegexpLiteralString:
      if (re->nrunes() == 0) {
        info = NoMatch();
        break;
      }
      if (latin1()) {
        info = LiteralLatin1(re->runes()[0]);
        for (int i = 1; i < re->nrunes(); i++) {
          info = Concat(info, LiteralLatin1(re->runes()[i]));
        }
      } else {
        info = Literal(re->runes()[0]);
        for (int i = 1; i < re->nrunes(); i++) {
          info = Concat(info, Literal(re->runes()[i]));
        }
      }
      break;

    case kRegexpConcat: {
      // Accumulate in info.
      // Exact is concat of recent contiguous exact nodes.
      info = NULL;
      Info* exact = NULL;
      for (int i = 0; i < nchild_args; i++) {
        Info* ci = child_args[i];  // child info
        if (!ci->is_exact() ||
            (exact && ci->exact().size() * exact->exact().size() > 16)) {
          // Exact run is over.
          info = And(info, exact);
          exact = NULL;
          // Add this child's info.
          info = And(info, ci);
        } else {
          // Append to exact run.
          exact = Concat(exact, ci);
        }
      }
      info = And(info, exact);
    }
      break;

    case kRegexpAlternate:
      info = child_args[0];
      for (int i = 1; i < nchild_args; i++)
        info = Alt(info, child_args[i]);
      VLOG(10) << "Alt: " << info->ToString();
      break;

    case kRegexpStar:
      info = Star(child_args[0]);
      break;

    case kRegexpQuest:
      info = Quest(child_args[0]);
      break;

    case kRegexpPlus:
      info = Plus(child_args[0]);
      break;

    case kRegexpAnyChar:
      // Claim nothing, except that it's not empty.
      info = AnyChar();
      break;

    case kRegexpCharClass:
      info = CClass(re->cc(), latin1());
      break;

    case kRegexpCapture:
      // These don't affect the set of matching strings.
      info = child_args[0];
      break;
  }

  if (Trace) {
    VLOG(0) << "BuildInfo " << re->ToString()
            << ": " << info->ToString();
  }

  return info;
}


Prefilter* Prefilter::FromRegexp(Regexp* re) {
  if (re == NULL)
    return NULL;

  Regexp* simple = re->Simplify();
  Prefilter::Info *info = BuildInfo(simple);

  simple->Decref();
  if (info == NULL)
    return NULL;

  Prefilter* m = info->TakeMatch();

  delete info;
  return m;
}

string Prefilter::DebugString() const {
  if (this == NULL)
    return "<nil>";

  switch (op_) {
    default:
      LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
      return StringPrintf("op%d", op_);
    case NONE:
      return "*no-matches*";
    case ATOM:
      return atom_;
    case ALL:
      return "";
    case AND: {
      string s = "";
      for (int i = 0; i < subs_->size(); i++) {
        if (i > 0)
          s += " ";
        s += (*subs_)[i]->DebugString();
      }
      return s;
    }
    case OR: {
      string s = "(";
      for (int i = 0; i < subs_->size(); i++) {
        if (i > 0)
          s += "|";
        s += (*subs_)[i]->DebugString();
      }
      s += ")";
      return s;
    }
  }
}

Prefilter* Prefilter::FromRE2(const RE2* re2) {
  if (re2 == NULL)
    return NULL;

  Regexp* regexp = re2->Regexp();
  if (regexp == NULL)
    return NULL;

  return FromRegexp(regexp);
}


}  // namespace re2