// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <vector>
#include "util/test.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/testing/regexp_generator.h"
#include "re2/testing/string_generator.h"
namespace re2 {
// Test that C++ strings are compared as uint8s, not int8s.
// PossibleMatchRange doesn't depend on this, but callers probably will.
TEST(CplusplusStrings, EightBit) {
string s = "\x70";
string t = "\xA0";
EXPECT_LT(s, t);
}
struct PrefixTest {
const char* regexp;
int maxlen;
const char* min;
const char* max;
};
static PrefixTest tests[] = {
{ "", 10, "", "", },
{ "Abcdef", 10, "Abcdef", "Abcdef" },
{ "abc(def|ghi)", 10, "abcdef", "abcghi" },
{ "a+hello", 10, "aa", "ahello" },
{ "a*hello", 10, "a", "hello" },
{ "def|abc", 10, "abc", "def" },
{ "a(b)(c)[d]", 10, "abcd", "abcd" },
{ "ab(cab|cat)", 10, "abcab", "abcat" },
{ "ab(cab|ca)x", 10, "abcabx", "abcax" },
{ "(ab|x)(c|de)", 10, "abc", "xde" },
{ "(ab|x)?(c|z)?", 10, "", "z" },
{ "[^\\s\\S]", 10, "", "" },
{ "(abc)+", 5, "abc", "abcac" },
{ "(abc)+", 2, "ab", "ac" },
{ "(abc)+", 1, "a", "b" },
{ "[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
{ "a*", 10, "", "ab" },
{ "(?i)Abcdef", 10, "ABCDEF", "abcdef" },
{ "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" },
{ "(?i)a+hello", 10, "AA", "ahello" },
{ "(?i)a*hello", 10, "A", "hello" },
{ "(?i)def|abc", 10, "ABC", "def" },
{ "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" },
{ "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" },
{ "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" },
{ "(?i)(ab|x)(c|de)", 10, "ABC", "xde" },
{ "(?i)(ab|x)?(c|z)?", 10, "", "z" },
{ "(?i)[^\\s\\S]", 10, "", "" },
{ "(?i)(abc)+", 5, "ABC", "abcac" },
{ "(?i)(abc)+", 2, "AB", "ac" },
{ "(?i)(abc)+", 1, "A", "b" },
{ "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
{ "(?i)a*", 10, "", "ab" },
{ "(?i)A*", 10, "", "ab" },
{ "\\AAbcdef", 10, "Abcdef", "Abcdef" },
{ "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" },
{ "\\Aa+hello", 10, "aa", "ahello" },
{ "\\Aa*hello", 10, "a", "hello" },
{ "\\Adef|abc", 10, "abc", "def" },
{ "\\Aa(b)(c)[d]", 10, "abcd", "abcd" },
{ "\\Aab(cab|cat)", 10, "abcab", "abcat" },
{ "\\Aab(cab|ca)x", 10, "abcabx", "abcax" },
{ "\\A(ab|x)(c|de)", 10, "abc", "xde" },
{ "\\A(ab|x)?(c|z)?", 10, "", "z" },
{ "\\A[^\\s\\S]", 10, "", "" },
{ "\\A(abc)+", 5, "abc", "abcac" },
{ "\\A(abc)+", 2, "ab", "ac" },
{ "\\A(abc)+", 1, "a", "b" },
{ "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
{ "\\Aa*", 10, "", "ab" },
{ "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" },
{ "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" },
{ "(?i)\\Aa+hello", 10, "AA", "ahello" },
{ "(?i)\\Aa*hello", 10, "A", "hello" },
{ "(?i)\\Adef|abc", 10, "ABC", "def" },
{ "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" },
{ "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" },
{ "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" },
{ "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" },
{ "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" },
{ "(?i)\\A[^\\s\\S]", 10, "", "" },
{ "(?i)\\A(abc)+", 5, "ABC", "abcac" },
{ "(?i)\\A(abc)+", 2, "AB", "ac" },
{ "(?i)\\A(abc)+", 1, "A", "b" },
{ "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
{ "(?i)\\Aa*", 10, "", "ab" },
{ "(?i)\\AA*", 10, "", "ab" },
};
TEST(PossibleMatchRange, HandWritten) {
for (int i = 0; i < arraysize(tests); i++) {
for (int j = 0; j < 2; j++) {
const PrefixTest& t = tests[i];
string min, max;
if (j == 0) {
LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
CHECK(re);
Prog* prog = re->CompileToProg(0);
CHECK(prog);
CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen))
<< " " << t.regexp;
delete prog;
re->Decref();
} else {
CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
}
EXPECT_EQ(t.min, min) << t.regexp;
EXPECT_EQ(t.max, max) << t.regexp;
}
}
}
// Test cases where PossibleMatchRange should return false.
TEST(PossibleMatchRange, Failures) {
string min, max;
// Fails because no room to write max.
EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
// Fails because there is no max -- any non-empty string matches
// or begins a match. Have to use Latin-1 input, because there
// are no valid UTF-8 strings beginning with byte 0xFF.
EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2(".*", RE2::Latin1).
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
EXPECT_FALSE(RE2("\\C*").
PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
// Fails because it's a malformed regexp.
EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
<< "min=" << CEscape(min) << ", max=" << CEscape(max);
}
// Exhaustive test: generate all regexps within parameters,
// then generate all strings of a given length over a given alphabet,
// then check that the prefix information agrees with whether
// the regexp matches each of the strings.
class PossibleMatchTester : public RegexpGenerator {
public:
PossibleMatchTester(int maxatoms,
int maxops,
const vector<string>& alphabet,
const vector<string>& ops,
int maxstrlen,
const vector<string>& stralphabet)
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
strgen_(maxstrlen, stralphabet),
regexps_(0), tests_(0) { }
int regexps() { return regexps_; }
int tests() { return tests_; }
// Needed for RegexpGenerator interface.
void HandleRegexp(const string& regexp);
private:
StringGenerator strgen_;
int regexps_; // Number of HandleRegexp calls
int tests_; // Number of regexp tests.
DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester);
};
// Processes a single generated regexp.
// Checks that all accepted strings agree with the prefix range.
void PossibleMatchTester::HandleRegexp(const string& regexp) {
regexps_++;
VLOG(3) << CEscape(regexp);
RE2 re(regexp, RE2::Latin1);
CHECK_EQ(re.error(), "");
string min, max;
if(!re.PossibleMatchRange(&min, &max, 10)) {
// There's no good max for "\\C*". Can't use strcmp
// because sometimes it gets embedded in more
// complicated expressions.
if(strstr(regexp.c_str(), "\\C*"))
return;
LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
}
strgen_.Reset();
while (strgen_.HasNext()) {
const StringPiece& s = strgen_.Next();
tests_++;
if (!RE2::FullMatch(s, re))
continue;
CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max;
CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min;
}
}
TEST(PossibleMatchRange, Exhaustive) {
int natom = 3;
int noperator = 3;
int stringlen = 5;
if (DEBUG_MODE) {
natom = 2;
noperator = 3;
stringlen = 3;
}
PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
RegexpGenerator::EgrepOps(),
stringlen, Explode("ab4"));
t.Generate();
LOG(INFO) << t.regexps() << " regexps, "
<< t.tests() << " tests";
}
} // namespace re2