// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. #include <vector> #include "util/test.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" #include "re2/testing/regexp_generator.h" #include "re2/testing/string_generator.h" namespace re2 { // Test that C++ strings are compared as uint8s, not int8s. // PossibleMatchRange doesn't depend on this, but callers probably will. TEST(CplusplusStrings, EightBit) { string s = "\x70"; string t = "\xA0"; EXPECT_LT(s, t); } struct PrefixTest { const char* regexp; int maxlen; const char* min; const char* max; }; static PrefixTest tests[] = { { "", 10, "", "", }, { "Abcdef", 10, "Abcdef", "Abcdef" }, { "abc(def|ghi)", 10, "abcdef", "abcghi" }, { "a+hello", 10, "aa", "ahello" }, { "a*hello", 10, "a", "hello" }, { "def|abc", 10, "abc", "def" }, { "a(b)(c)[d]", 10, "abcd", "abcd" }, { "ab(cab|cat)", 10, "abcab", "abcat" }, { "ab(cab|ca)x", 10, "abcabx", "abcax" }, { "(ab|x)(c|de)", 10, "abc", "xde" }, { "(ab|x)?(c|z)?", 10, "", "z" }, { "[^\\s\\S]", 10, "", "" }, { "(abc)+", 5, "abc", "abcac" }, { "(abc)+", 2, "ab", "ac" }, { "(abc)+", 1, "a", "b" }, { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, { "a*", 10, "", "ab" }, { "(?i)Abcdef", 10, "ABCDEF", "abcdef" }, { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" }, { "(?i)a+hello", 10, "AA", "ahello" }, { "(?i)a*hello", 10, "A", "hello" }, { "(?i)def|abc", 10, "ABC", "def" }, { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" }, { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" }, { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" }, { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" }, { "(?i)(ab|x)?(c|z)?", 10, "", "z" }, { "(?i)[^\\s\\S]", 10, "", "" }, { "(?i)(abc)+", 5, "ABC", "abcac" }, { "(?i)(abc)+", 2, "AB", "ac" }, { "(?i)(abc)+", 1, "A", "b" }, { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, { "(?i)a*", 10, "", "ab" }, { "(?i)A*", 10, "", "ab" }, { "\\AAbcdef", 10, "Abcdef", "Abcdef" }, { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" }, { "\\Aa+hello", 10, "aa", "ahello" }, { "\\Aa*hello", 10, "a", "hello" }, { "\\Adef|abc", 10, "abc", "def" }, { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" }, { "\\Aab(cab|cat)", 10, "abcab", "abcat" }, { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" }, { "\\A(ab|x)(c|de)", 10, "abc", "xde" }, { "\\A(ab|x)?(c|z)?", 10, "", "z" }, { "\\A[^\\s\\S]", 10, "", "" }, { "\\A(abc)+", 5, "abc", "abcac" }, { "\\A(abc)+", 2, "ab", "ac" }, { "\\A(abc)+", 1, "a", "b" }, { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, { "\\Aa*", 10, "", "ab" }, { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" }, { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" }, { "(?i)\\Aa+hello", 10, "AA", "ahello" }, { "(?i)\\Aa*hello", 10, "A", "hello" }, { "(?i)\\Adef|abc", 10, "ABC", "def" }, { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" }, { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" }, { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" }, { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" }, { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" }, { "(?i)\\A[^\\s\\S]", 10, "", "" }, { "(?i)\\A(abc)+", 5, "ABC", "abcac" }, { "(?i)\\A(abc)+", 2, "AB", "ac" }, { "(?i)\\A(abc)+", 1, "A", "b" }, { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, { "(?i)\\Aa*", 10, "", "ab" }, { "(?i)\\AA*", 10, "", "ab" }, }; TEST(PossibleMatchRange, HandWritten) { for (int i = 0; i < arraysize(tests); i++) { for (int j = 0; j < 2; j++) { const PrefixTest& t = tests[i]; string min, max; if (j == 0) { LOG(INFO) << "Checking regexp=" << CEscape(t.regexp); Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); CHECK(re); Prog* prog = re->CompileToProg(0); CHECK(prog); CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen)) << " " << t.regexp; delete prog; re->Decref(); } else { CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen)); } EXPECT_EQ(t.min, min) << t.regexp; EXPECT_EQ(t.max, max) << t.regexp; } } } // Test cases where PossibleMatchRange should return false. TEST(PossibleMatchRange, Failures) { string min, max; // Fails because no room to write max. EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0)); // Fails because there is no max -- any non-empty string matches // or begins a match. Have to use Latin-1 input, because there // are no valid UTF-8 strings beginning with byte 0xFF. EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2(".+hello", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2(".*hello", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2(".*", RE2::Latin1). PossibleMatchRange(&min, &max, 10)) << "min=" << CEscape(min) << ", max=" << CEscape(max); EXPECT_FALSE(RE2("\\C*"). PossibleMatchRange(&min, &max, 10)) << "min=" << CEscape(min) << ", max=" << CEscape(max); // Fails because it's a malformed regexp. EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10)) << "min=" << CEscape(min) << ", max=" << CEscape(max); } // Exhaustive test: generate all regexps within parameters, // then generate all strings of a given length over a given alphabet, // then check that the prefix information agrees with whether // the regexp matches each of the strings. class PossibleMatchTester : public RegexpGenerator { public: PossibleMatchTester(int maxatoms, int maxops, const vector<string>& alphabet, const vector<string>& ops, int maxstrlen, const vector<string>& stralphabet) : RegexpGenerator(maxatoms, maxops, alphabet, ops), strgen_(maxstrlen, stralphabet), regexps_(0), tests_(0) { } int regexps() { return regexps_; } int tests() { return tests_; } // Needed for RegexpGenerator interface. void HandleRegexp(const string& regexp); private: StringGenerator strgen_; int regexps_; // Number of HandleRegexp calls int tests_; // Number of regexp tests. DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester); }; // Processes a single generated regexp. // Checks that all accepted strings agree with the prefix range. void PossibleMatchTester::HandleRegexp(const string& regexp) { regexps_++; VLOG(3) << CEscape(regexp); RE2 re(regexp, RE2::Latin1); CHECK_EQ(re.error(), ""); string min, max; if(!re.PossibleMatchRange(&min, &max, 10)) { // There's no good max for "\\C*". Can't use strcmp // because sometimes it gets embedded in more // complicated expressions. if(strstr(regexp.c_str(), "\\C*")) return; LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp); } strgen_.Reset(); while (strgen_.HasNext()) { const StringPiece& s = strgen_.Next(); tests_++; if (!RE2::FullMatch(s, re)) continue; CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max; CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min; } } TEST(PossibleMatchRange, Exhaustive) { int natom = 3; int noperator = 3; int stringlen = 5; if (DEBUG_MODE) { natom = 2; noperator = 3; stringlen = 3; } PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"), RegexpGenerator::EgrepOps(), stringlen, Explode("ab4")); t.Generate(); LOG(INFO) << t.regexps() << " regexps, " << t.tests() << " tests"; } } // namespace re2