// -*- coding: utf-8 -*- // // Copyright (c) 2005 - 2010, Google Inc. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author: Sanjay Ghemawat // // TODO: Test extractions for PartialMatch/Consume #ifdef HAVE_CONFIG_H #include "config.h" #endif #include <stdio.h> #include <string.h> /* for memset and strcmp */ #include <cassert> #include <vector> #include "pcrecpp.h" using pcrecpp::StringPiece; using pcrecpp::RE; using pcrecpp::RE_Options; using pcrecpp::Hex; using pcrecpp::Octal; using pcrecpp::CRadix; static bool VERBOSE_TEST = false; // CHECK dies with a fatal error if condition is not true. It is *not* // controlled by NDEBUG, so the check will be executed regardless of // compilation mode. Therefore, it is safe to do things like: // CHECK_EQ(fp->Write(x), 4) #define CHECK(condition) do { \ if (!(condition)) { \ fprintf(stderr, "%s:%d: Check failed: %s\n", \ __FILE__, __LINE__, #condition); \ exit(1); \ } \ } while (0) #define CHECK_EQ(a, b) CHECK(a == b) static void Timing1(int num_iters) { // Same pattern lots of times RE pattern("ruby:\\d+"); StringPiece p("ruby:1234"); for (int j = num_iters; j > 0; j--) { CHECK(pattern.FullMatch(p)); } } static void Timing2(int num_iters) { // Same pattern lots of times RE pattern("ruby:(\\d+)"); int i; for (int j = num_iters; j > 0; j--) { CHECK(pattern.FullMatch("ruby:1234", &i)); CHECK_EQ(i, 1234); } } static void Timing3(int num_iters) { string text_string; for (int j = num_iters; j > 0; j--) { text_string += "this is another line\n"; } RE line_matcher(".*\n"); string line; StringPiece text(text_string); int counter = 0; while (line_matcher.Consume(&text)) { counter++; } printf("Matched %d lines\n", counter); } #if 0 // uncomment this if you have a way of defining VirtualProcessSize() static void LeakTest() { // Check for memory leaks unsigned long long initial_size = 0; for (int i = 0; i < 100000; i++) { if (i == 50000) { initial_size = VirtualProcessSize(); printf("Size after 50000: %llu\n", initial_size); } char buf[100]; // definitely big enough sprintf(buf, "pat%09d", i); RE newre(buf); } uint64 final_size = VirtualProcessSize(); printf("Size after 100000: %llu\n", final_size); const double growth = double(final_size - initial_size) / final_size; printf("Growth: %0.2f%%", growth * 100); CHECK(growth < 0.02); // Allow < 2% growth } #endif static void RadixTests() { printf("Testing hex\n"); #define CHECK_HEX(type, value) \ do { \ type v; \ CHECK(RE("([0-9a-fA-F]+)[uUlL]*").FullMatch(#value, Hex(&v))); \ CHECK_EQ(v, 0x ## value); \ CHECK(RE("([0-9a-fA-FxX]+)[uUlL]*").FullMatch("0x" #value, CRadix(&v))); \ CHECK_EQ(v, 0x ## value); \ } while(0) CHECK_HEX(short, 2bad); CHECK_HEX(unsigned short, 2badU); CHECK_HEX(int, dead); CHECK_HEX(unsigned int, deadU); CHECK_HEX(long, 7eadbeefL); CHECK_HEX(unsigned long, deadbeefUL); #ifdef HAVE_LONG_LONG CHECK_HEX(long long, 12345678deadbeefLL); #endif #ifdef HAVE_UNSIGNED_LONG_LONG CHECK_HEX(unsigned long long, cafebabedeadbeefULL); #endif #undef CHECK_HEX printf("Testing octal\n"); #define CHECK_OCTAL(type, value) \ do { \ type v; \ CHECK(RE("([0-7]+)[uUlL]*").FullMatch(#value, Octal(&v))); \ CHECK_EQ(v, 0 ## value); \ CHECK(RE("([0-9a-fA-FxX]+)[uUlL]*").FullMatch("0" #value, CRadix(&v))); \ CHECK_EQ(v, 0 ## value); \ } while(0) CHECK_OCTAL(short, 77777); CHECK_OCTAL(unsigned short, 177777U); CHECK_OCTAL(int, 17777777777); CHECK_OCTAL(unsigned int, 37777777777U); CHECK_OCTAL(long, 17777777777L); CHECK_OCTAL(unsigned long, 37777777777UL); #ifdef HAVE_LONG_LONG CHECK_OCTAL(long long, 777777777777777777777LL); #endif #ifdef HAVE_UNSIGNED_LONG_LONG CHECK_OCTAL(unsigned long long, 1777777777777777777777ULL); #endif #undef CHECK_OCTAL printf("Testing decimal\n"); #define CHECK_DECIMAL(type, value) \ do { \ type v; \ CHECK(RE("(-?[0-9]+)[uUlL]*").FullMatch(#value, &v)); \ CHECK_EQ(v, value); \ CHECK(RE("(-?[0-9a-fA-FxX]+)[uUlL]*").FullMatch(#value, CRadix(&v))); \ CHECK_EQ(v, value); \ } while(0) CHECK_DECIMAL(short, -1); CHECK_DECIMAL(unsigned short, 9999); CHECK_DECIMAL(int, -1000); CHECK_DECIMAL(unsigned int, 12345U); CHECK_DECIMAL(long, -10000000L); CHECK_DECIMAL(unsigned long, 3083324652U); #ifdef HAVE_LONG_LONG CHECK_DECIMAL(long long, -100000000000000LL); #endif #ifdef HAVE_UNSIGNED_LONG_LONG CHECK_DECIMAL(unsigned long long, 1234567890987654321ULL); #endif #undef CHECK_DECIMAL } static void TestReplace() { printf("Testing Replace\n"); struct ReplaceTest { const char *regexp; const char *rewrite; const char *original; const char *single; const char *global; int global_count; // the expected return value from ReplaceAll }; static const ReplaceTest tests[] = { { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)", "\\2\\1ay", "the quick brown fox jumps over the lazy dogs.", "ethay quick brown fox jumps over the lazy dogs.", "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.", 9 }, { "\\w+", "\\0-NOSPAM", "paul.haahr@google.com", "paul-NOSPAM.haahr@google.com", "paul-NOSPAM.haahr-NOSPAM@google-NOSPAM.com-NOSPAM", 4 }, { "^", "(START)", "foo", "(START)foo", "(START)foo", 1 }, { "^", "(START)", "", "(START)", "(START)", 1 }, { "$", "(END)", "", "(END)", "(END)", 1 }, { "b", "bb", "ababababab", "abbabababab", "abbabbabbabbabb", 5 }, { "b", "bb", "bbbbbb", "bbbbbbb", "bbbbbbbbbbbb", 6 }, { "b+", "bb", "bbbbbb", "bb", "bb", 1 }, { "b*", "bb", "bbbbbb", "bb", "bbbb", 2 }, { "b*", "bb", "aaaaa", "bbaaaaa", "bbabbabbabbabbabb", 6 }, { "b*", "bb", "aa\naa\n", "bbaa\naa\n", "bbabbabb\nbbabbabb\nbb", 7 }, { "b*", "bb", "aa\raa\r", "bbaa\raa\r", "bbabbabb\rbbabbabb\rbb", 7 }, { "b*", "bb", "aa\r\naa\r\n", "bbaa\r\naa\r\n", "bbabbabb\r\nbbabbabb\r\nbb", 7 }, // Check empty-string matching (it's tricky!) { "aa|b*", "@", "aa", "@", "@@", 2 }, { "b*|aa", "@", "aa", "@aa", "@@@", 3 }, #ifdef SUPPORT_UTF8 { "b*", "bb", "\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8", // utf8 "bb\xE3\x83\x9B\xE3\x83\xBC\xE3\x83\xA0\xE3\x81\xB8", "bb\xE3\x83\x9B""bb""\xE3\x83\xBC""bb""\xE3\x83\xA0""bb""\xE3\x81\xB8""bb", 5 }, { "b*", "bb", "\xE3\x83\x9B\r\n\xE3\x83\xBC\r\xE3\x83\xA0\n\xE3\x81\xB8\r\n", // utf8 "bb\xE3\x83\x9B\r\n\xE3\x83\xBC\r\xE3\x83\xA0\n\xE3\x81\xB8\r\n", ("bb\xE3\x83\x9B""bb\r\nbb""\xE3\x83\xBC""bb\rbb""\xE3\x83\xA0" "bb\nbb""\xE3\x81\xB8""bb\r\nbb"), 9 }, #endif { "", NULL, NULL, NULL, NULL, 0 } }; #ifdef SUPPORT_UTF8 const bool support_utf8 = true; #else const bool support_utf8 = false; #endif for (const ReplaceTest *t = tests; t->original != NULL; ++t) { RE re(t->regexp, RE_Options(PCRE_NEWLINE_CRLF).set_utf8(support_utf8)); assert(re.error().empty()); string one(t->original); CHECK(re.Replace(t->rewrite, &one)); CHECK_EQ(one, t->single); string all(t->original); const int replace_count = re.GlobalReplace(t->rewrite, &all); CHECK_EQ(all, t->global); CHECK_EQ(replace_count, t->global_count); } // One final test: test \r\n replacement when we're not in CRLF mode { RE re("b*", RE_Options(PCRE_NEWLINE_CR).set_utf8(support_utf8)); assert(re.error().empty()); string all("aa\r\naa\r\n"); CHECK_EQ(re.GlobalReplace("bb", &all), 9); CHECK_EQ(all, string("bbabbabb\rbb\nbbabbabb\rbb\nbb")); } { RE re("b*", RE_Options(PCRE_NEWLINE_LF).set_utf8(support_utf8)); assert(re.error().empty()); string all("aa\r\naa\r\n"); CHECK_EQ(re.GlobalReplace("bb", &all), 9); CHECK_EQ(all, string("bbabbabb\rbb\nbbabbabb\rbb\nbb")); } // TODO: test what happens when no PCRE_NEWLINE_* flag is set. // Alas, the answer depends on how pcre was compiled. } static void TestExtract() { printf("Testing Extract\n"); string s; CHECK(RE("(.*)@([^.]*)").Extract("\\2!\\1", "boris@kremvax.ru", &s)); CHECK_EQ(s, "kremvax!boris"); // check the RE interface as well CHECK(RE(".*").Extract("'\\0'", "foo", &s)); CHECK_EQ(s, "'foo'"); CHECK(!RE("bar").Extract("'\\0'", "baz", &s)); CHECK_EQ(s, "'foo'"); } static void TestConsume() { printf("Testing Consume\n"); string word; string s(" aaa b!@#$@#$cccc"); StringPiece input(s); RE r("\\s*(\\w+)"); // matches a word, possibly proceeded by whitespace CHECK(r.Consume(&input, &word)); CHECK_EQ(word, "aaa"); CHECK(r.Consume(&input, &word)); CHECK_EQ(word, "b"); CHECK(! r.Consume(&input, &word)); } static void TestFindAndConsume() { printf("Testing FindAndConsume\n"); string word; string s(" aaa b!@#$@#$cccc"); StringPiece input(s); RE r("(\\w+)"); // matches a word CHECK(r.FindAndConsume(&input, &word)); CHECK_EQ(word, "aaa"); CHECK(r.FindAndConsume(&input, &word)); CHECK_EQ(word, "b"); CHECK(r.FindAndConsume(&input, &word)); CHECK_EQ(word, "cccc"); CHECK(! r.FindAndConsume(&input, &word)); } static void TestMatchNumberPeculiarity() { printf("Testing match-number peculiarity\n"); string word1; string word2; string word3; RE r("(foo)|(bar)|(baz)"); CHECK(r.PartialMatch("foo", &word1, &word2, &word3)); CHECK_EQ(word1, "foo"); CHECK_EQ(word2, ""); CHECK_EQ(word3, ""); CHECK(r.PartialMatch("bar", &word1, &word2, &word3)); CHECK_EQ(word1, ""); CHECK_EQ(word2, "bar"); CHECK_EQ(word3, ""); CHECK(r.PartialMatch("baz", &word1, &word2, &word3)); CHECK_EQ(word1, ""); CHECK_EQ(word2, ""); CHECK_EQ(word3, "baz"); CHECK(!r.PartialMatch("f", &word1, &word2, &word3)); string a; CHECK(RE("(foo)|hello").FullMatch("hello", &a)); CHECK_EQ(a, ""); } static void TestRecursion() { printf("Testing recursion\n"); // Get one string that passes (sometimes), one that never does. string text_good("abcdefghijk"); string text_bad("acdefghijkl"); // According to pcretest, matching text_good against (\w+)*b // requires match_limit of at least 8192, and match_recursion_limit // of at least 37. RE_Options options_ml; options_ml.set_match_limit(8192); RE re("(\\w+)*b", options_ml); CHECK(re.PartialMatch(text_good) == true); CHECK(re.PartialMatch(text_bad) == false); CHECK(re.FullMatch(text_good) == false); CHECK(re.FullMatch(text_bad) == false); options_ml.set_match_limit(1024); RE re2("(\\w+)*b", options_ml); CHECK(re2.PartialMatch(text_good) == false); // because of match_limit CHECK(re2.PartialMatch(text_bad) == false); CHECK(re2.FullMatch(text_good) == false); CHECK(re2.FullMatch(text_bad) == false); RE_Options options_mlr; options_mlr.set_match_limit_recursion(50); RE re3("(\\w+)*b", options_mlr); CHECK(re3.PartialMatch(text_good) == true); CHECK(re3.PartialMatch(text_bad) == false); CHECK(re3.FullMatch(text_good) == false); CHECK(re3.FullMatch(text_bad) == false); options_mlr.set_match_limit_recursion(10); RE re4("(\\w+)*b", options_mlr); CHECK(re4.PartialMatch(text_good) == false); CHECK(re4.PartialMatch(text_bad) == false); CHECK(re4.FullMatch(text_good) == false); CHECK(re4.FullMatch(text_bad) == false); } // A meta-quoted string, interpreted as a pattern, should always match // the original unquoted string. static void TestQuoteMeta(string unquoted, RE_Options options = RE_Options()) { string quoted = RE::QuoteMeta(unquoted); RE re(quoted, options); CHECK(re.FullMatch(unquoted)); } // A string containing meaningful regexp characters, which is then meta- // quoted, should not generally match a string the unquoted string does. static void NegativeTestQuoteMeta(string unquoted, string should_not_match, RE_Options options = RE_Options()) { string quoted = RE::QuoteMeta(unquoted); RE re(quoted, options); CHECK(!re.FullMatch(should_not_match)); } // Tests that quoted meta characters match their original strings, // and that a few things that shouldn't match indeed do not. static void TestQuotaMetaSimple() { TestQuoteMeta("foo"); TestQuoteMeta("foo.bar"); TestQuoteMeta("foo\\.bar"); TestQuoteMeta("[1-9]"); TestQuoteMeta("1.5-2.0?"); TestQuoteMeta("\\d"); TestQuoteMeta("Who doesn't like ice cream?"); TestQuoteMeta("((a|b)c?d*e+[f-h]i)"); TestQuoteMeta("((?!)xxx).*yyy"); TestQuoteMeta("(["); TestQuoteMeta(string("foo\0bar", 7)); } static void TestQuoteMetaSimpleNegative() { NegativeTestQuoteMeta("foo", "bar"); NegativeTestQuoteMeta("...", "bar"); NegativeTestQuoteMeta("\\.", "."); NegativeTestQuoteMeta("\\.", ".."); NegativeTestQuoteMeta("(a)", "a"); NegativeTestQuoteMeta("(a|b)", "a"); NegativeTestQuoteMeta("(a|b)", "(a)"); NegativeTestQuoteMeta("(a|b)", "a|b"); NegativeTestQuoteMeta("[0-9]", "0"); NegativeTestQuoteMeta("[0-9]", "0-9"); NegativeTestQuoteMeta("[0-9]", "[9]"); NegativeTestQuoteMeta("((?!)xxx)", "xxx"); } static void TestQuoteMetaLatin1() { TestQuoteMeta("3\xb2 = 9"); } static void TestQuoteMetaUtf8() { #ifdef SUPPORT_UTF8 TestQuoteMeta("Pl\xc3\xa1\x63ido Domingo", pcrecpp::UTF8()); TestQuoteMeta("xyz", pcrecpp::UTF8()); // No fancy utf8 TestQuoteMeta("\xc2\xb0", pcrecpp::UTF8()); // 2-byte utf8 (degree symbol) TestQuoteMeta("27\xc2\xb0 degrees", pcrecpp::UTF8()); // As a middle character TestQuoteMeta("\xe2\x80\xb3", pcrecpp::UTF8()); // 3-byte utf8 (double prime) TestQuoteMeta("\xf0\x9d\x85\x9f", pcrecpp::UTF8()); // 4-byte utf8 (music note) TestQuoteMeta("27\xc2\xb0"); // Interpreted as Latin-1, but should still work NegativeTestQuoteMeta("27\xc2\xb0", // 2-byte utf (degree symbol) "27\\\xc2\\\xb0", pcrecpp::UTF8()); #endif } static void TestQuoteMetaAll() { printf("Testing QuoteMeta\n"); TestQuotaMetaSimple(); TestQuoteMetaSimpleNegative(); TestQuoteMetaLatin1(); TestQuoteMetaUtf8(); } // // Options tests contributed by // Giuseppe Maxia, CTO, Stardata s.r.l. // July 2005 // static void GetOneOptionResult( const char *option_name, const char *regex, const char *str, RE_Options options, bool full, string expected) { printf("Testing Option <%s>\n", option_name); if(VERBOSE_TEST) printf("/%s/ finds \"%s\" within \"%s\" \n", regex, expected.c_str(), str); string captured(""); if (full) RE(regex,options).FullMatch(str, &captured); else RE(regex,options).PartialMatch(str, &captured); CHECK_EQ(captured, expected); } static void TestOneOption( const char *option_name, const char *regex, const char *str, RE_Options options, bool full, bool assertive = true) { printf("Testing Option <%s>\n", option_name); if (VERBOSE_TEST) printf("'%s' %s /%s/ \n", str, (assertive? "matches" : "doesn't match"), regex); if (assertive) { if (full) CHECK(RE(regex,options).FullMatch(str)); else CHECK(RE(regex,options).PartialMatch(str)); } else { if (full) CHECK(!RE(regex,options).FullMatch(str)); else CHECK(!RE(regex,options).PartialMatch(str)); } } static void Test_CASELESS() { RE_Options options; RE_Options options2; options.set_caseless(true); TestOneOption("CASELESS (class)", "HELLO", "hello", options, false); TestOneOption("CASELESS (class2)", "HELLO", "hello", options2.set_caseless(true), false); TestOneOption("CASELESS (class)", "^[A-Z]+$", "Hello", options, false); TestOneOption("CASELESS (function)", "HELLO", "hello", pcrecpp::CASELESS(), false); TestOneOption("CASELESS (function)", "^[A-Z]+$", "Hello", pcrecpp::CASELESS(), false); options.set_caseless(false); TestOneOption("no CASELESS", "HELLO", "hello", options, false, false); } static void Test_MULTILINE() { RE_Options options; RE_Options options2; const char *str = "HELLO\n" "cruel\n" "world\n"; options.set_multiline(true); TestOneOption("MULTILINE (class)", "^cruel$", str, options, false); TestOneOption("MULTILINE (class2)", "^cruel$", str, options2.set_multiline(true), false); TestOneOption("MULTILINE (function)", "^cruel$", str, pcrecpp::MULTILINE(), false); options.set_multiline(false); TestOneOption("no MULTILINE", "^cruel$", str, options, false, false); } static void Test_DOTALL() { RE_Options options; RE_Options options2; const char *str = "HELLO\n" "cruel\n" "world"; options.set_dotall(true); TestOneOption("DOTALL (class)", "HELLO.*world", str, options, true); TestOneOption("DOTALL (class2)", "HELLO.*world", str, options2.set_dotall(true), true); TestOneOption("DOTALL (function)", "HELLO.*world", str, pcrecpp::DOTALL(), true); options.set_dotall(false); TestOneOption("no DOTALL", "HELLO.*world", str, options, true, false); } static void Test_DOLLAR_ENDONLY() { RE_Options options; RE_Options options2; const char *str = "HELLO world\n"; TestOneOption("no DOLLAR_ENDONLY", "world$", str, options, false); options.set_dollar_endonly(true); TestOneOption("DOLLAR_ENDONLY 1", "world$", str, options, false, false); TestOneOption("DOLLAR_ENDONLY 2", "world$", str, options2.set_dollar_endonly(true), false, false); } static void Test_EXTRA() { RE_Options options; const char *str = "HELLO"; options.set_extra(true); TestOneOption("EXTRA 1", "\\HELL\\O", str, options, true, false ); TestOneOption("EXTRA 2", "\\HELL\\O", str, RE_Options().set_extra(true), true, false ); options.set_extra(false); TestOneOption("no EXTRA", "\\HELL\\O", str, options, true ); } static void Test_EXTENDED() { RE_Options options; RE_Options options2; const char *str = "HELLO world"; options.set_extended(true); TestOneOption("EXTENDED (class)", "HELLO world", str, options, false, false); TestOneOption("EXTENDED (class2)", "HELLO world", str, options2.set_extended(true), false, false); TestOneOption("EXTENDED (class)", "^ HE L{2} O " "\\s+ " "\\w+ $ ", str, options, false); TestOneOption("EXTENDED (function)", "HELLO world", str, pcrecpp::EXTENDED(), false, false); TestOneOption("EXTENDED (function)", "^ HE L{2} O " "\\s+ " "\\w+ $ ", str, pcrecpp::EXTENDED(), false); options.set_extended(false); TestOneOption("no EXTENDED", "HELLO world", str, options, false); } static void Test_NO_AUTO_CAPTURE() { RE_Options options; const char *str = "HELLO world"; string captured; printf("Testing Option <no NO_AUTO_CAPTURE>\n"); if (VERBOSE_TEST) printf("parentheses capture text\n"); RE re("(world|universe)$", options); CHECK(re.Extract("\\1", str , &captured)); CHECK_EQ(captured, "world"); options.set_no_auto_capture(true); printf("testing Option <NO_AUTO_CAPTURE>\n"); if (VERBOSE_TEST) printf("parentheses do not capture text\n"); re.Extract("\\1",str, &captured ); CHECK_EQ(captured, "world"); } static void Test_UNGREEDY() { RE_Options options; const char *str = "HELLO, 'this' is the 'world'"; options.set_ungreedy(true); GetOneOptionResult("UNGREEDY 1", "('.*')", str, options, false, "'this'" ); GetOneOptionResult("UNGREEDY 2", "('.*')", str, RE_Options().set_ungreedy(true), false, "'this'" ); GetOneOptionResult("UNGREEDY", "('.*?')", str, options, false, "'this' is the 'world'" ); options.set_ungreedy(false); GetOneOptionResult("no UNGREEDY", "('.*')", str, options, false, "'this' is the 'world'" ); GetOneOptionResult("no UNGREEDY", "('.*?')", str, options, false, "'this'" ); } static void Test_all_options() { const char *str = "HELLO\n" "cruel\n" "world"; RE_Options options; options.set_all_options(PCRE_CASELESS | PCRE_DOTALL); TestOneOption("all_options (CASELESS|DOTALL)", "^hello.*WORLD", str , options, false); options.set_all_options(0); TestOneOption("all_options (0)", "^hello.*WORLD", str , options, false, false); options.set_all_options(PCRE_MULTILINE | PCRE_EXTENDED); TestOneOption("all_options (MULTILINE|EXTENDED)", " ^ c r u e l $ ", str, options, false); TestOneOption("all_options (MULTILINE|EXTENDED) with constructor", " ^ c r u e l $ ", str, RE_Options(PCRE_MULTILINE | PCRE_EXTENDED), false); TestOneOption("all_options (MULTILINE|EXTENDED) with concatenation", " ^ c r u e l $ ", str, RE_Options() .set_multiline(true) .set_extended(true), false); options.set_all_options(0); TestOneOption("all_options (0)", "^ c r u e l $", str, options, false, false); } static void TestOptions() { printf("Testing Options\n"); Test_CASELESS(); Test_MULTILINE(); Test_DOTALL(); Test_DOLLAR_ENDONLY(); Test_EXTENDED(); Test_NO_AUTO_CAPTURE(); Test_UNGREEDY(); Test_EXTRA(); Test_all_options(); } static void TestConstructors() { printf("Testing constructors\n"); RE_Options options; options.set_dotall(true); const char *str = "HELLO\n" "cruel\n" "world"; RE orig("HELLO.*world", options); CHECK(orig.FullMatch(str)); RE copy1(orig); CHECK(copy1.FullMatch(str)); RE copy2("not a match"); CHECK(!copy2.FullMatch(str)); copy2 = copy1; CHECK(copy2.FullMatch(str)); copy2 = orig; CHECK(copy2.FullMatch(str)); // Make sure when we assign to ourselves, nothing bad happens orig = orig; copy1 = copy1; copy2 = copy2; CHECK(orig.FullMatch(str)); CHECK(copy1.FullMatch(str)); CHECK(copy2.FullMatch(str)); } int main(int argc, char** argv) { // Treat any flag as --help if (argc > 1 && argv[1][0] == '-') { printf("Usage: %s [timing1|timing2|timing3 num-iters]\n" " If 'timingX ###' is specified, run the given timing test\n" " with the given number of iterations, rather than running\n" " the default corectness test.\n", argv[0]); return 0; } if (argc > 1) { if ( argc == 2 || atoi(argv[2]) == 0) { printf("timing mode needs a num-iters argument\n"); return 1; } if (!strcmp(argv[1], "timing1")) Timing1(atoi(argv[2])); else if (!strcmp(argv[1], "timing2")) Timing2(atoi(argv[2])); else if (!strcmp(argv[1], "timing3")) Timing3(atoi(argv[2])); else printf("Unknown argument '%s'\n", argv[1]); return 0; } printf("PCRE C++ wrapper tests\n"); printf("Testing FullMatch\n"); int i; string s; /***** FullMatch with no args *****/ CHECK(RE("h.*o").FullMatch("hello")); CHECK(!RE("h.*o").FullMatch("othello")); // Must be anchored at front CHECK(!RE("h.*o").FullMatch("hello!")); // Must be anchored at end CHECK(RE("a*").FullMatch("aaaa")); // Fullmatch with normal op CHECK(RE("a*?").FullMatch("aaaa")); // Fullmatch with nongreedy op CHECK(RE("a*?\\z").FullMatch("aaaa")); // Two unusual ops /***** FullMatch with args *****/ // Zero-arg CHECK(RE("\\d+").FullMatch("1001")); // Single-arg CHECK(RE("(\\d+)").FullMatch("1001", &i)); CHECK_EQ(i, 1001); CHECK(RE("(-?\\d+)").FullMatch("-123", &i)); CHECK_EQ(i, -123); CHECK(!RE("()\\d+").FullMatch("10", &i)); CHECK(!RE("(\\d+)").FullMatch("1234567890123456789012345678901234567890", &i)); // Digits surrounding integer-arg CHECK(RE("1(\\d*)4").FullMatch("1234", &i)); CHECK_EQ(i, 23); CHECK(RE("(\\d)\\d+").FullMatch("1234", &i)); CHECK_EQ(i, 1); CHECK(RE("(-\\d)\\d+").FullMatch("-1234", &i)); CHECK_EQ(i, -1); CHECK(RE("(\\d)").PartialMatch("1234", &i)); CHECK_EQ(i, 1); CHECK(RE("(-\\d)").PartialMatch("-1234", &i)); CHECK_EQ(i, -1); // String-arg CHECK(RE("h(.*)o").FullMatch("hello", &s)); CHECK_EQ(s, string("ell")); // StringPiece-arg StringPiece sp; CHECK(RE("(\\w+):(\\d+)").FullMatch("ruby:1234", &sp, &i)); CHECK_EQ(sp.size(), 4); CHECK(memcmp(sp.data(), "ruby", 4) == 0); CHECK_EQ(i, 1234); // Multi-arg CHECK(RE("(\\w+):(\\d+)").FullMatch("ruby:1234", &s, &i)); CHECK_EQ(s, string("ruby")); CHECK_EQ(i, 1234); // Ignore non-void* NULL arg CHECK(RE("he(.*)lo").FullMatch("hello", (char*)NULL)); CHECK(RE("h(.*)o").FullMatch("hello", (string*)NULL)); CHECK(RE("h(.*)o").FullMatch("hello", (StringPiece*)NULL)); CHECK(RE("(.*)").FullMatch("1234", (int*)NULL)); #ifdef HAVE_LONG_LONG CHECK(RE("(.*)").FullMatch("1234567890123456", (long long*)NULL)); #endif CHECK(RE("(.*)").FullMatch("123.4567890123456", (double*)NULL)); CHECK(RE("(.*)").FullMatch("123.4567890123456", (float*)NULL)); // Fail on non-void* NULL arg if the match doesn't parse for the given type. CHECK(!RE("h(.*)lo").FullMatch("hello", &s, (char*)NULL)); CHECK(!RE("(.*)").FullMatch("hello", (int*)NULL)); CHECK(!RE("(.*)").FullMatch("1234567890123456", (int*)NULL)); CHECK(!RE("(.*)").FullMatch("hello", (double*)NULL)); CHECK(!RE("(.*)").FullMatch("hello", (float*)NULL)); // Ignored arg CHECK(RE("(\\w+)(:)(\\d+)").FullMatch("ruby:1234", &s, (void*)NULL, &i)); CHECK_EQ(s, string("ruby")); CHECK_EQ(i, 1234); // Type tests { char c; CHECK(RE("(H)ello").FullMatch("Hello", &c)); CHECK_EQ(c, 'H'); } { unsigned char c; CHECK(RE("(H)ello").FullMatch("Hello", &c)); CHECK_EQ(c, static_cast<unsigned char>('H')); } { short v; CHECK(RE("(-?\\d+)").FullMatch("100", &v)); CHECK_EQ(v, 100); CHECK(RE("(-?\\d+)").FullMatch("-100", &v)); CHECK_EQ(v, -100); CHECK(RE("(-?\\d+)").FullMatch("32767", &v)); CHECK_EQ(v, 32767); CHECK(RE("(-?\\d+)").FullMatch("-32768", &v)); CHECK_EQ(v, -32768); CHECK(!RE("(-?\\d+)").FullMatch("-32769", &v)); CHECK(!RE("(-?\\d+)").FullMatch("32768", &v)); } { unsigned short v; CHECK(RE("(\\d+)").FullMatch("100", &v)); CHECK_EQ(v, 100); CHECK(RE("(\\d+)").FullMatch("32767", &v)); CHECK_EQ(v, 32767); CHECK(RE("(\\d+)").FullMatch("65535", &v)); CHECK_EQ(v, 65535); CHECK(!RE("(\\d+)").FullMatch("65536", &v)); } { int v; static const int max_value = 0x7fffffff; static const int min_value = -max_value - 1; CHECK(RE("(-?\\d+)").FullMatch("100", &v)); CHECK_EQ(v, 100); CHECK(RE("(-?\\d+)").FullMatch("-100", &v)); CHECK_EQ(v, -100); CHECK(RE("(-?\\d+)").FullMatch("2147483647", &v)); CHECK_EQ(v, max_value); CHECK(RE("(-?\\d+)").FullMatch("-2147483648", &v)); CHECK_EQ(v, min_value); CHECK(!RE("(-?\\d+)").FullMatch("-2147483649", &v)); CHECK(!RE("(-?\\d+)").FullMatch("2147483648", &v)); } { unsigned int v; static const unsigned int max_value = 0xfffffffful; CHECK(RE("(\\d+)").FullMatch("100", &v)); CHECK_EQ(v, 100); CHECK(RE("(\\d+)").FullMatch("4294967295", &v)); CHECK_EQ(v, max_value); CHECK(!RE("(\\d+)").FullMatch("4294967296", &v)); } #ifdef HAVE_LONG_LONG # if defined(__MINGW__) || defined(__MINGW32__) # define LLD "%I64d" # define LLU "%I64u" # else # define LLD "%lld" # define LLU "%llu" # endif { long long v; static const long long max_value = 0x7fffffffffffffffLL; static const long long min_value = -max_value - 1; char buf[32]; // definitely big enough for a long long CHECK(RE("(-?\\d+)").FullMatch("100", &v)); CHECK_EQ(v, 100); CHECK(RE("(-?\\d+)").FullMatch("-100",&v)); CHECK_EQ(v, -100); sprintf(buf, LLD, max_value); CHECK(RE("(-?\\d+)").FullMatch(buf,&v)); CHECK_EQ(v, max_value); sprintf(buf, LLD, min_value); CHECK(RE("(-?\\d+)").FullMatch(buf,&v)); CHECK_EQ(v, min_value); sprintf(buf, LLD, max_value); assert(buf[strlen(buf)-1] != '9'); buf[strlen(buf)-1]++; CHECK(!RE("(-?\\d+)").FullMatch(buf, &v)); sprintf(buf, LLD, min_value); assert(buf[strlen(buf)-1] != '9'); buf[strlen(buf)-1]++; CHECK(!RE("(-?\\d+)").FullMatch(buf, &v)); } #endif #if defined HAVE_UNSIGNED_LONG_LONG && defined HAVE_LONG_LONG { unsigned long long v; long long v2; static const unsigned long long max_value = 0xffffffffffffffffULL; char buf[32]; // definitely big enough for a unsigned long long CHECK(RE("(-?\\d+)").FullMatch("100",&v)); CHECK_EQ(v, 100); CHECK(RE("(-?\\d+)").FullMatch("-100",&v2)); CHECK_EQ(v2, -100); sprintf(buf, LLU, max_value); CHECK(RE("(-?\\d+)").FullMatch(buf,&v)); CHECK_EQ(v, max_value); assert(buf[strlen(buf)-1] != '9'); buf[strlen(buf)-1]++; CHECK(!RE("(-?\\d+)").FullMatch(buf, &v)); } #endif { float v; CHECK(RE("(.*)").FullMatch("100", &v)); CHECK(RE("(.*)").FullMatch("-100.", &v)); CHECK(RE("(.*)").FullMatch("1e23", &v)); } { double v; CHECK(RE("(.*)").FullMatch("100", &v)); CHECK(RE("(.*)").FullMatch("-100.", &v)); CHECK(RE("(.*)").FullMatch("1e23", &v)); } // Check that matching is fully anchored CHECK(!RE("(\\d+)").FullMatch("x1001", &i)); CHECK(!RE("(\\d+)").FullMatch("1001x", &i)); CHECK(RE("x(\\d+)").FullMatch("x1001", &i)); CHECK_EQ(i, 1001); CHECK(RE("(\\d+)x").FullMatch("1001x", &i)); CHECK_EQ(i, 1001); // Braces CHECK(RE("[0-9a-f+.-]{5,}").FullMatch("0abcd")); CHECK(RE("[0-9a-f+.-]{5,}").FullMatch("0abcde")); CHECK(!RE("[0-9a-f+.-]{5,}").FullMatch("0abc")); // Complicated RE CHECK(RE("foo|bar|[A-Z]").FullMatch("foo")); CHECK(RE("foo|bar|[A-Z]").FullMatch("bar")); CHECK(RE("foo|bar|[A-Z]").FullMatch("X")); CHECK(!RE("foo|bar|[A-Z]").FullMatch("XY")); // Check full-match handling (needs '$' tacked on internally) CHECK(RE("fo|foo").FullMatch("fo")); CHECK(RE("fo|foo").FullMatch("foo")); CHECK(RE("fo|foo$").FullMatch("fo")); CHECK(RE("fo|foo$").FullMatch("foo")); CHECK(RE("foo$").FullMatch("foo")); CHECK(!RE("foo\\$").FullMatch("foo$bar")); CHECK(!RE("fo|bar").FullMatch("fox")); // Uncomment the following if we change the handling of '$' to // prevent it from matching a trailing newline if (false) { // Check that we don't get bitten by pcre's special handling of a // '\n' at the end of the string matching '$' CHECK(!RE("foo$").PartialMatch("foo\n")); } // Number of args int a[16]; CHECK(RE("").FullMatch("")); memset(a, 0, sizeof(0)); CHECK(RE("(\\d){1}").FullMatch("1", &a[0])); CHECK_EQ(a[0], 1); memset(a, 0, sizeof(0)); CHECK(RE("(\\d)(\\d)").FullMatch("12", &a[0], &a[1])); CHECK_EQ(a[0], 1); CHECK_EQ(a[1], 2); memset(a, 0, sizeof(0)); CHECK(RE("(\\d)(\\d)(\\d)").FullMatch("123", &a[0], &a[1], &a[2])); CHECK_EQ(a[0], 1); CHECK_EQ(a[1], 2); CHECK_EQ(a[2], 3); memset(a, 0, sizeof(0)); CHECK(RE("(\\d)(\\d)(\\d)(\\d)").FullMatch("1234", &a[0], &a[1], &a[2], &a[3])); CHECK_EQ(a[0], 1); CHECK_EQ(a[1], 2); CHECK_EQ(a[2], 3); CHECK_EQ(a[3], 4); memset(a, 0, sizeof(0)); CHECK(RE("(\\d)(\\d)(\\d)(\\d)(\\d)").FullMatch("12345", &a[0], &a[1], &a[2], &a[3], &a[4])); CHECK_EQ(a[0], 1); CHECK_EQ(a[1], 2); CHECK_EQ(a[2], 3); CHECK_EQ(a[3], 4); CHECK_EQ(a[4], 5); memset(a, 0, sizeof(0)); CHECK(RE("(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)").FullMatch("123456", &a[0], &a[1], &a[2], &a[3], &a[4], &a[5])); CHECK_EQ(a[0], 1); CHECK_EQ(a[1], 2); CHECK_EQ(a[2], 3); CHECK_EQ(a[3], 4); CHECK_EQ(a[4], 5); CHECK_EQ(a[5], 6); memset(a, 0, sizeof(0)); CHECK(RE("(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)").FullMatch("1234567", &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6])); CHECK_EQ(a[0], 1); CHECK_EQ(a[1], 2); CHECK_EQ(a[2], 3); CHECK_EQ(a[3], 4); CHECK_EQ(a[4], 5); CHECK_EQ(a[5], 6); CHECK_EQ(a[6], 7); memset(a, 0, sizeof(0)); CHECK(RE("(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)" "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)").FullMatch( "1234567890123456", &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7], &a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15])); CHECK_EQ(a[0], 1); CHECK_EQ(a[1], 2); CHECK_EQ(a[2], 3); CHECK_EQ(a[3], 4); CHECK_EQ(a[4], 5); CHECK_EQ(a[5], 6); CHECK_EQ(a[6], 7); CHECK_EQ(a[7], 8); CHECK_EQ(a[8], 9); CHECK_EQ(a[9], 0); CHECK_EQ(a[10], 1); CHECK_EQ(a[11], 2); CHECK_EQ(a[12], 3); CHECK_EQ(a[13], 4); CHECK_EQ(a[14], 5); CHECK_EQ(a[15], 6); /***** PartialMatch *****/ printf("Testing PartialMatch\n"); CHECK(RE("h.*o").PartialMatch("hello")); CHECK(RE("h.*o").PartialMatch("othello")); CHECK(RE("h.*o").PartialMatch("hello!")); CHECK(RE("((((((((((((((((((((x))))))))))))))))))))").PartialMatch("x")); /***** other tests *****/ RadixTests(); TestReplace(); TestExtract(); TestConsume(); TestFindAndConsume(); TestQuoteMetaAll(); TestMatchNumberPeculiarity(); // Check the pattern() accessor { const string kPattern = "http://([^/]+)/.*"; const RE re(kPattern); CHECK_EQ(kPattern, re.pattern()); } // Check RE error field. { RE re("foo"); CHECK(re.error().empty()); // Must have no error } #ifdef SUPPORT_UTF8 // Check UTF-8 handling { printf("Testing UTF-8 handling\n"); // Three Japanese characters (nihongo) const unsigned char utf8_string[] = { 0xe6, 0x97, 0xa5, // 65e5 0xe6, 0x9c, 0xac, // 627c 0xe8, 0xaa, 0x9e, // 8a9e 0 }; const unsigned char utf8_pattern[] = { '.', 0xe6, 0x9c, 0xac, // 627c '.', 0 }; // Both should match in either mode, bytes or UTF-8 RE re_test1("........."); CHECK(re_test1.FullMatch(utf8_string)); RE re_test2("...", pcrecpp::UTF8()); CHECK(re_test2.FullMatch(utf8_string)); // Check that '.' matches one byte or UTF-8 character // according to the mode. string ss; RE re_test3("(.)"); CHECK(re_test3.PartialMatch(utf8_string, &ss)); CHECK_EQ(ss, string("\xe6")); RE re_test4("(.)", pcrecpp::UTF8()); CHECK(re_test4.PartialMatch(utf8_string, &ss)); CHECK_EQ(ss, string("\xe6\x97\xa5")); // Check that string matches itself in either mode RE re_test5(utf8_string); CHECK(re_test5.FullMatch(utf8_string)); RE re_test6(utf8_string, pcrecpp::UTF8()); CHECK(re_test6.FullMatch(utf8_string)); // Check that pattern matches string only in UTF8 mode RE re_test7(utf8_pattern); CHECK(!re_test7.FullMatch(utf8_string)); RE re_test8(utf8_pattern, pcrecpp::UTF8()); CHECK(re_test8.FullMatch(utf8_string)); } // Check that ungreedy, UTF8 regular expressions don't match when they // oughtn't -- see bug 82246. { // This code always worked. const char* pattern = "\\w+X"; const string target = "a aX"; RE match_sentence(pattern); RE match_sentence_re(pattern, pcrecpp::UTF8()); CHECK(!match_sentence.FullMatch(target)); CHECK(!match_sentence_re.FullMatch(target)); } { const char* pattern = "(?U)\\w+X"; const string target = "a aX"; RE match_sentence(pattern); RE match_sentence_re(pattern, pcrecpp::UTF8()); CHECK(!match_sentence.FullMatch(target)); CHECK(!match_sentence_re.FullMatch(target)); } #endif /* def SUPPORT_UTF8 */ printf("Testing error reporting\n"); { RE re("a\\1"); CHECK(!re.error().empty()); } { RE re("a[x"); CHECK(!re.error().empty()); } { RE re("a[z-a]"); CHECK(!re.error().empty()); } { RE re("a[[:foobar:]]"); CHECK(!re.error().empty()); } { RE re("a(b"); CHECK(!re.error().empty()); } { RE re("a\\"); CHECK(!re.error().empty()); } // Test that recursion is stopped TestRecursion(); // Test Options if (getenv("VERBOSE_TEST") != NULL) VERBOSE_TEST = true; TestOptions(); // Test the constructors TestConstructors(); // Done printf("OK\n"); return 0; }