// Copyright 2008 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // Regular expression engine tester -- test all the implementations against each other. #include "util/util.h" #include "util/flags.h" #include "re2/testing/tester.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" DEFINE_bool(dump_prog, false, "dump regexp program"); DEFINE_bool(log_okay, false, "log successful runs"); DEFINE_bool(dump_rprog, false, "dump reversed regexp program"); DEFINE_int32(max_regexp_failures, 100, "maximum number of regexp test failures (-1 = unlimited)"); DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test"); namespace re2 { enum { kMaxSubmatch = 1+16, // $0...$16 }; const char* engine_types[kEngineMax] = { "Backtrack", "NFA", "DFA", "DFA1", "OnePass", "BitState", "RE2", "RE2a", "RE2b", "PCRE", }; // Returns the name string for the type t. static string EngineString(Engine t) { if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) { return StringPrintf("type%d", static_cast<int>(t)); } return engine_types[t]; } // Returns bit mask of engines to use. static uint32 Engines() { static uint32 cached_engines; static bool did_parse; if (did_parse) return cached_engines; if (FLAGS_regexp_engines.empty()) { cached_engines = ~0; } else { for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str())) cached_engines |= 1<<i; } if (cached_engines == 0) LOG(INFO) << "Warning: no engines enabled."; if (!UsingPCRE) cached_engines &= ~(1<<kEnginePCRE); for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) { if (cached_engines & (1<<i)) LOG(INFO) << EngineString(i) << " enabled"; } did_parse = true; return cached_engines; } // The result of running a match. struct TestInstance::Result { bool skipped; // test skipped: wasn't applicable bool matched; // found a match bool untrusted; // don't really trust the answer bool have_submatch; // computed all submatch info bool have_submatch0; // computed just submatch[0] StringPiece submatch[kMaxSubmatch]; }; typedef TestInstance::Result Result; // Formats a single capture range s in text in the form (a,b) // where a and b are the starting and ending offsets of s in text. static string FormatCapture(const StringPiece& text, const StringPiece& s) { if (s.begin() == NULL) return "(?,?)"; return StringPrintf("(%d,%d)", static_cast<int>(s.begin() - text.begin()), static_cast<int>(s.end() - text.begin())); } // Returns whether text contains non-ASCII (>= 0x80) bytes. static bool NonASCII(const StringPiece& text) { for (int i = 0; i < text.size(); i++) if ((uint8)text[i] >= 0x80) return true; return false; } // Returns string representation of match kind. static string FormatKind(Prog::MatchKind kind) { switch (kind) { case Prog::kFullMatch: return "full match"; case Prog::kLongestMatch: return "longest match"; case Prog::kFirstMatch: return "first match"; case Prog::kManyMatch: return "many match"; } return "???"; } // Returns string representation of anchor kind. static string FormatAnchor(Prog::Anchor anchor) { switch (anchor) { case Prog::kAnchored: return "anchored"; case Prog::kUnanchored: return "unanchored"; } return "???"; } struct ParseMode { Regexp::ParseFlags parse_flags; string desc; }; static const Regexp::ParseFlags single_line = Regexp::LikePerl; static const Regexp::ParseFlags multi_line = static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine); static ParseMode parse_modes[] = { { single_line, "single-line" }, { single_line|Regexp::Latin1, "single-line, latin1" }, { multi_line, "multiline" }, { multi_line|Regexp::NonGreedy, "multiline, nongreedy" }, { multi_line|Regexp::Latin1, "multiline, latin1" }, }; static string FormatMode(Regexp::ParseFlags flags) { for (int i = 0; i < arraysize(parse_modes); i++) if (parse_modes[i].parse_flags == flags) return parse_modes[i].desc; return StringPrintf("%#x", static_cast<uint>(flags)); } // Constructs and saves all the matching engines that // will be required for the given tests. TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind, Regexp::ParseFlags flags) : regexp_str_(regexp_str), kind_(kind), flags_(flags), error_(false), regexp_(NULL), num_captures_(0), prog_(NULL), rprog_(NULL), re_(NULL), re2_(NULL) { VLOG(1) << CEscape(regexp_str); // Compile regexp to prog. // Always required - needed for backtracking (reference implementation). RegexpStatus status; regexp_ = Regexp::Parse(regexp_str, flags, &status); if (regexp_ == NULL) { LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_) << " mode: " << FormatMode(flags); error_ = true; return; } num_captures_ = regexp_->NumCaptures(); prog_ = regexp_->CompileToProg(0); if (prog_ == NULL) { LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_); error_ = true; return; } if (FLAGS_dump_prog) { LOG(INFO) << "Prog for " << " regexp " << CEscape(regexp_str_) << " (" << FormatKind(kind_) << ", " << FormatMode(flags_) << ")\n" << prog_->Dump(); } // Compile regexp to reversed prog. Only needed for DFA engines. if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) { rprog_ = regexp_->CompileToReverseProg(0); if (rprog_ == NULL) { LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_); error_ = true; return; } if (FLAGS_dump_rprog) LOG(INFO) << rprog_->Dump(); } // Create re string that will be used for RE and RE2. string re = regexp_str.as_string(); // Accomodate flags. // Regexp::Latin1 will be accomodated below. if (!(flags & Regexp::OneLine)) re = "(?m)" + re; if (flags & Regexp::NonGreedy) re = "(?U)" + re; if (flags & Regexp::DotNL) re = "(?s)" + re; // Compile regexp to RE2. if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) { RE2::Options options; if (flags & Regexp::Latin1) options.set_encoding(RE2::Options::EncodingLatin1); if (kind_ == Prog::kLongestMatch) options.set_longest_match(true); re2_ = new RE2(re, options); if (!re2_->error().empty()) { LOG(INFO) << "Cannot RE2: " << CEscape(re); error_ = true; return; } } // Compile regexp to RE. // PCRE as exposed by the RE interface isn't always usable. // 1. It disagrees about handling of empty-string reptitions // like matching (a*)* against "b". PCRE treats the (a*) as // occurring once, while we treat it as occurring not at all. // 2. It treats $ as this weird thing meaning end of string // or before the \n at the end of the string. // 3. It doesn't implement POSIX leftmost-longest matching. // MimicsPCRE() detects 1 and 2. if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && kind_ != Prog::kLongestMatch) { PCRE_Options o; o.set_option(PCRE::UTF8); if (flags & Regexp::Latin1) o.set_option(PCRE::None); // PCRE has interface bug keeping us from finding $0, so // add one more layer of parens. re_ = new PCRE("("+re+")", o); if (!re_->error().empty()) { LOG(INFO) << "Cannot PCRE: " << CEscape(re); error_ = true; return; } } } TestInstance::~TestInstance() { if (regexp_) regexp_->Decref(); delete prog_; delete rprog_; delete re_; delete re2_; } // Runs a single search using the named engine type. // This interface hides all the irregularities of the various // engine interfaces from the rest of this file. void TestInstance::RunSearch(Engine type, const StringPiece& orig_text, const StringPiece& orig_context, Prog::Anchor anchor, Result *result) { memset(result, 0, sizeof *result); if (regexp_ == NULL) { result->skipped = true; return; } int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0 if (nsubmatch > kMaxSubmatch) nsubmatch = kMaxSubmatch; StringPiece text = orig_text; StringPiece context = orig_context; switch (type) { default: LOG(FATAL) << "Bad RunSearch type: " << (int)type; case kEngineBacktrack: if (prog_ == NULL) { result->skipped = true; break; } result->matched = prog_->UnsafeSearchBacktrack(text, context, anchor, kind_, result->submatch, nsubmatch); result->have_submatch = true; break; case kEngineNFA: if (prog_ == NULL) { result->skipped = true; break; } result->matched = prog_->SearchNFA(text, context, anchor, kind_, result->submatch, nsubmatch); result->have_submatch = true; break; case kEngineDFA: if (prog_ == NULL) { result->skipped = true; break; } result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL, &result->skipped, NULL); break; case kEngineDFA1: if (prog_ == NULL || rprog_ == NULL) { result->skipped = true; break; } result->matched = prog_->SearchDFA(text, context, anchor, kind_, result->submatch, &result->skipped, NULL); // If anchored, no need for second run, // but do it anyway to find more bugs. if (result->matched) { if (!rprog_->SearchDFA(result->submatch[0], context, Prog::kAnchored, Prog::kLongestMatch, result->submatch, &result->skipped, NULL)) { LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_) << " on " << CEscape(text); result->matched = false; } } result->have_submatch0 = true; break; case kEngineOnePass: if (prog_ == NULL || anchor == Prog::kUnanchored || !prog_->IsOnePass() || nsubmatch > Prog::kMaxOnePassCapture) { result->skipped = true; break; } result->matched = prog_->SearchOnePass(text, context, anchor, kind_, result->submatch, nsubmatch); result->have_submatch = true; break; case kEngineBitState: if (prog_ == NULL) { result->skipped = true; break; } result->matched = prog_->SearchBitState(text, context, anchor, kind_, result->submatch, nsubmatch); result->have_submatch = true; break; case kEngineRE2: case kEngineRE2a: case kEngineRE2b: { if (!re2_ || text.end() != context.end()) { result->skipped = true; break; } RE2::Anchor re_anchor; if (anchor == Prog::kAnchored) re_anchor = RE2::ANCHOR_START; else re_anchor = RE2::UNANCHORED; if (kind_ == Prog::kFullMatch) re_anchor = RE2::ANCHOR_BOTH; result->matched = re2_->Match(context, text.begin() - context.begin(), text.end() - context.begin(), re_anchor, result->submatch, nsubmatch); result->have_submatch = nsubmatch > 0; break; } case kEnginePCRE: { if (!re_ || text.begin() != context.begin() || text.end() != context.end()) { result->skipped = true; break; } const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; PCRE::Arg *a = new PCRE::Arg[nsubmatch]; for (int i = 0; i < nsubmatch; i++) { a[i] = PCRE::Arg(&result->submatch[i]); argptr[i] = &a[i]; } int consumed; PCRE::Anchor pcre_anchor; if (anchor == Prog::kAnchored) pcre_anchor = PCRE::ANCHOR_START; else pcre_anchor = PCRE::UNANCHORED; if (kind_ == Prog::kFullMatch) pcre_anchor = PCRE::ANCHOR_BOTH; re_->ClearHitLimit(); result->matched = re_->DoMatch(text, pcre_anchor, &consumed, argptr, nsubmatch); if (re_->HitLimit()) { result->untrusted = true; delete[] argptr; delete[] a; break; } result->have_submatch = true; // Work around RE interface bug: PCRE returns -1 as the // offsets for an unmatched subexpression, and RE should // turn that into StringPiece(NULL) but in fact it uses // StringPiece(text.begin() - 1, 0). Oops. for (int i = 0; i < nsubmatch; i++) if (result->submatch[i].begin() == text.begin() - 1) result->submatch[i] = NULL; delete[] argptr; delete[] a; break; } } if (!result->matched) memset(result->submatch, 0, sizeof result->submatch); } // Checks whether r is okay given that correct is the right answer. // Specifically, r's answers have to match (but it doesn't have to // claim to have all the answers). static bool ResultOkay(const Result& r, const Result& correct) { if (r.skipped) return true; if (r.matched != correct.matched) return false; if (r.have_submatch || r.have_submatch0) { for (int i = 0; i < kMaxSubmatch; i++) { if (correct.submatch[i].begin() != r.submatch[i].begin() || correct.submatch[i].size() != r.submatch[i].size()) return false; if (!r.have_submatch) break; } } return true; } // Runs a single test. bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context, Prog::Anchor anchor) { // Backtracking is the gold standard. Result correct; RunSearch(kEngineBacktrack, text, context, anchor, &correct); if (correct.skipped) { if (regexp_ == NULL) return true; LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_) << " " << FormatMode(flags_); return false; } VLOG(1) << "Try: regexp " << CEscape(regexp_str_) << " text " << CEscape(text) << " (" << FormatKind(kind_) << ", " << FormatAnchor(anchor) << ", " << FormatMode(flags_) << ")"; // Compare the others. bool all_okay = true; for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) { if (!(Engines() & (1<<i))) continue; Result r; RunSearch(i, text, context, anchor, &r); if (ResultOkay(r, correct)) { if (FLAGS_log_okay) LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); continue; } // We disagree with PCRE on the meaning of some Unicode matches. // In particular, we treat all non-ASCII UTF-8 as word characters. // We also treat "empty" character sets like [^\w\W] as being // impossible to match, while PCRE apparently excludes some code // points (e.g., 0x0080) from both \w and \W. if (i == kEnginePCRE && NonASCII(text)) continue; if (!r.untrusted) all_okay = false; LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text, context, anchor); if (r.matched != correct.matched) { if (r.matched) { LOG(INFO) << " Should not match (but does)."; } else { LOG(INFO) << " Should match (but does not)."; continue; } } for (int i = 0; i < 1+num_captures_; i++) { if (r.submatch[i].begin() != correct.submatch[i].begin() || r.submatch[i].end() != correct.submatch[i].end()) { LOG(INFO) << StringPrintf(" $%d: should be %s is %s", i, FormatCapture(text, correct.submatch[i]).c_str(), FormatCapture(text, r.submatch[i]).c_str()); } else { LOG(INFO) << StringPrintf(" $%d: %s ok", i, FormatCapture(text, r.submatch[i]).c_str()); } } } if (!all_okay) { if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0) LOG(QFATAL) << "Too many regexp failures."; } return all_okay; } void TestInstance::LogMatch(const char* prefix, Engine e, const StringPiece& text, const StringPiece& context, Prog::Anchor anchor) { LOG(INFO) << prefix << EngineString(e) << " regexp " << CEscape(regexp_str_) << " " << CEscape(regexp_->ToString()) << " text " << CEscape(text) << " (" << text.begin() - context.begin() << "," << text.end() - context.begin() << ") of context " << CEscape(context) << " (" << FormatKind(kind_) << ", " << FormatAnchor(anchor) << ", " << FormatMode(flags_) << ")"; } static Prog::MatchKind kinds[] = { Prog::kFirstMatch, Prog::kLongestMatch, Prog::kFullMatch, }; // Test all possible match kinds and parse modes. Tester::Tester(const StringPiece& regexp) { error_ = false; for (int i = 0; i < arraysize(kinds); i++) { for (int j = 0; j < arraysize(parse_modes); j++) { TestInstance* t = new TestInstance(regexp, kinds[i], parse_modes[j].parse_flags); error_ |= t->error(); v_.push_back(t); } } } Tester::~Tester() { for (int i = 0; i < v_.size(); i++) delete v_[i]; } bool Tester::TestCase(const StringPiece& text, const StringPiece& context, Prog::Anchor anchor) { bool okay = true; for (int i = 0; i < v_.size(); i++) okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); return okay; } static Prog::Anchor anchors[] = { Prog::kAnchored, Prog::kUnanchored }; bool Tester::TestInput(const StringPiece& text) { bool okay = TestInputInContext(text, text); if (text.size() > 0) { StringPiece sp; sp = text; sp.remove_prefix(1); okay &= TestInputInContext(sp, text); sp = text; sp.remove_suffix(1); okay &= TestInputInContext(sp, text); } return okay; } bool Tester::TestInputInContext(const StringPiece& text, const StringPiece& context) { bool okay = true; for (int i = 0; i < arraysize(anchors); i++) okay &= TestCase(text, context, anchors[i]); return okay; } bool TestRegexpOnText(const StringPiece& regexp, const StringPiece& text) { Tester t(regexp); return t.TestInput(text); } } // namespace re2