/* ****************************************************************************** * Copyright (C) 2005-2007, International Business Machines Corporation and * * others. All Rights Reserved. * ****************************************************************************** */ #include <stdio.h> #include <string.h> #include <stdlib.h> #include <time.h> #include "wbnf.h" // Most of this code is meant to test the test code. It's a self test. // Normally this isn't run. #define TEST_WBNF_TEST 0 /////////////////////////////////////////////////////////// // // Constants and the most basic helper classes // static const char DIGIT_CHAR[] = "0123456789"; static const char WHITE_SPACE[] = {'\t', ' ', '\r', '\n', 0}; static const char ALPHABET[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; static const char SPECIAL[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; static inline UBool isInList(const char c /*in*/, const char list[] /*in*/){ const char * p = list; for (;*p != 0 && *p != c; p++); return *p?TRUE:FALSE; } static inline UBool isDigit(char c) {return isInList(c, DIGIT_CHAR);} static inline UBool isWhiteSpace(char c) {return isInList(c, WHITE_SPACE);} static inline UBool isAlphabet(char c) {return isInList(c, ALPHABET);} static inline UBool isSpecialAsciiChar(char c) {return isInList(c,SPECIAL);} /////////////////////////////////////////////////////////// // // Helper classes // class Buffer_byte{ // Utility class, can be treated as an auto expanded array. no boundary check. typedef char byte; byte * start; byte * current; int buffer_size; // size unit is byte public: inline int content_size(){return current - start;} // size unit is byte private: inline void expand(int add_size = 100){ // size unit is byte int new_size = buffer_size + add_size; int cs_snap = content_size(); start = (byte *) realloc(start, new_size); // may change the value of start current = start + cs_snap; memset(current, 0, add_size); buffer_size = new_size; } inline void expand_to(int size){ int r = size - buffer_size; if (r > 0) { expand(r); // simply expand, no block alignment } } Buffer_byte(const Buffer_byte &); Buffer_byte & operator = (const Buffer_byte &); public: Buffer_byte():start(NULL),current(start),buffer_size(0){ expand(); } ~Buffer_byte(){ free(start); } inline void reset(){ start != NULL ? memset(start, 0, buffer_size) : 0; current = start; } // Using memory copy method to append a C array to buffer, inline void append(const void * c, int size){ // size unit is byte expand_to(content_size() + size) ; memcpy(current, c, size); current = current + size; } byte * buffer(){ return start; } }; /* The class(es) try to work as bulid-in array, so it overloads these two operators operator type *(); type & operator[]; The first is used to auto type convert, the latter is used to select member. A small trick is the class does not overload the address-of operator. This behavior is different from bulid-in array, but it give us the opportunity to get the address of the class itself. */ //template<typename type> // class BUFFER{ // typedef BUFFER name; #define BUFFER(type, name)\ class name {\ private:\ Buffer_byte buf;\ public:\ name & reset() {buf.reset(); return *this;}\ name & append(type c) {buf.append(&c, sizeof(type)); return *this;}\ name & append_array(const type * p, int size) {buf.append(p, sizeof(type)*size); return *this;}\ type & operator [] (int i) { return ((type *) buf.buffer())[i];}\ operator type *(){return (type *) buf.buffer();} \ int content_size(){return buf.content_size() / sizeof(type);}\ } class Pick{ /* The Pick is the basic language generator element*/ public: // generate a string accroding the syntax // Return a null-terminated c-string. The buffer is owned by callee. virtual const char* next() = 0; virtual ~Pick(){}; }; //typedef BUFFER<char> Buffer_char; //typedef BUFFER<int> Buffer_int; //typedef BUFFER<Pick *> Buffer_pPick; BUFFER(char, Buffer_char); BUFFER(int, Buffer_int); BUFFER(Pick *, Buffer_pPick); class SymbolTable{ /* Helper class. * It's a mapping table between 'variable name' and its 'active Pick object' */ private: Buffer_char name_buffer; // var names storage space Buffer_int names; // points to name (offset in name_buffer) Buffer_pPick refs; // points to Pick int get_index(const char *const var_name){ int len = names.content_size(); for (int i=0; i< len; i++){ if (strcmp(var_name, name_buffer + names[i]) == 0){ return i; } } return -1; } public: enum RESULT {EMPTY, NO_VAR, NO_REF, HAS_REF}; RESULT find(const char *const var_name /*[in] c-string*/, Pick * * ref = NULL /*[out] Pick* */){ if (!var_name) return EMPTY; // NULL name int i = get_index(var_name); if (i == -1){ return NO_VAR; // new name } if (!refs[i]){ // exist name, no ref return NO_REF; } else { if (ref) { *ref = refs[i]; } return HAS_REF; // exist name, has ref } } void put(const char *const var_name, Pick *const var_ref = NULL){ int i = get_index(var_name); switch(find(var_name)){ case EMPTY: // NULL name break; case NO_VAR: // new name int offset; offset = name_buffer.content_size(); name_buffer.append_array(var_name, strlen(var_name) + 1); names.append(offset); refs.append(var_ref); break; case NO_REF: // exist name, no ref refs[i] = var_ref; // link definition with variable break; case HAS_REF: // exist name, has ref if (var_ref){ refs[i] = var_ref; } break; default: ; // ASSERT(FALSE); } return; } UBool is_complete(){ int n = names.content_size(); for (int i=0; i<n; ++i){ if (refs[i] == NULL){ return FALSE; } } return TRUE; } void reset(){ names.reset(); name_buffer.reset(); // release memory here int s = refs.content_size(); for (int i=0; i < s; i++){ delete refs[i]; // TOFIX: point alias/recursion problem } refs.reset(); } ~SymbolTable(){ reset(); } }; /* // Document of class Escaper // // ATTENTION: // From http://icu-project.org/userguide/Collate_Customization.html. // We get the precedence of escape/quote operations // // (highest) 1. backslash \ // 2. two single quotes '' // 3. quoting ' ' // // ICU Collation should accept following as the same string. // // 1) 'ab'c _ // 2) a\bc \ // 3) a'b'\c |- They are equal. // 4) abc _/ // // From "two single quotes", we have following deductions // D1. empty quoting is illgal. (obviously) // D2. no contact operation between two quotings // '.''.' is not .. it is .'. // D3. "two single quotes" cannot contact two quoting simultaneously // '..''''.' is not ..'. it is ..''. // NOTICE: // "two single quotes" can contact before one quoting // '''.' is '. // "two single quotes" can literally contact after one quoting // But, from syntax, it's one quoting including a "two single quotes" // '.''' is .' // D4. "two single quotes" cannot solely be included in quoting // '''' is not ' it is '' // NOTICE: These are legal // '.''.' is .'. // '.''' is .' // // dicision // /\ // /__\ // output buffer input buffer // // To make our dicision (within an atom operation) without caring input and output buffer, // following calling pattern (within an atom operation) shall be avoided // // P1 open_quoting() then close_quoting() (direct violation) D1 // P2 close_quoting() then open_quoting() (direct violation) D2 // P3 empty open_quoting() (indirect violation) D1, D4 // P4 empty close_quoting() (indirect violation) D2, D3 // P5 open_quoting() then two single quotes (indirect violation) D4 // P6 close_quoting() then two single quotes (indirect violation) D3 // // two single quotes escaping will not open_ or close_ quoting() // The choice will not lose some quoing forms. // // For open_quoting(), // we may get this form quoting ''' P5 // It may raise a bug ''''x // If we expect // '''.' let the next char open the quoting // '.''.' the quoting is already opened by preceding char // // For close_quoting() // we will get this form quoting '.''' P6 // It may raise a bug '.''''.' // If we expect // '.'''\. let the next char close the quoting // '.''''.' the expectation is wrong! using '.'\''.' instead // // It's a hard work to re-adjust generation opportunity for various escaping form. // We just simply ignore it. */ class Escaper{ public: enum CHOICE {YES, NO, RAND}; enum ESCAPE_FORM {BSLASH_ONLY, QUOTE_ONLY, QUOTE_AND_BSLAH, RAND_ESC}; private: class Bool{ // A wrapper class for CHOICE, to auto adapter UBool class private: const CHOICE tag; public: Bool(CHOICE flag=RAND):tag(flag){} operator UBool() { // conversion operator return tag == RAND ? rand()%2 : tag == YES; //if (tag == RAND){ // return rand()%2 == 1; //} else { // return tag == YES ? TRUE : FALSE; //} } }; public: Escaper(CHOICE escapeLiteral = RAND, CHOICE twoQuotesEscape = RAND, ESCAPE_FORM escapeForm = RAND_ESC): escape_form(escapeForm), escape_literal(escapeLiteral), two_quotes_escape(twoQuotesEscape), is_quoting(FALSE){} private: Buffer_char str; ESCAPE_FORM escape_form; Bool escape_literal; Bool two_quotes_escape; UBool quote_escape; UBool bslash_escape; UBool is_quoting; void set_options(){ ESCAPE_FORM t = escape_form == RAND_ESC ? (ESCAPE_FORM) (rand()%3) : escape_form; switch (t){ case BSLASH_ONLY : bslash_escape = TRUE; quote_escape = FALSE; break; case QUOTE_ONLY: bslash_escape = FALSE;quote_escape = TRUE; break; case QUOTE_AND_BSLAH: bslash_escape = TRUE; quote_escape = TRUE; break; default: ;// error } } void reset(){ str.reset(); is_quoting = FALSE; } inline void open_quoting(){ if(is_quoting){ // do nothing } else { str.append('\''); is_quoting = TRUE; } } inline void close_quoting(){ if(is_quoting){ str.append('\''); is_quoting = FALSE; } else { // do nothing } } // str [in] null-terminated c-string void append(const char * strToAppend){ for(;*strToAppend != 0; strToAppend++){ append(*strToAppend); } } inline void append(const char c){ set_options(); if (c == '\\'){ quote_escape ? open_quoting() : close_quoting(); //bslash_escape always true here str.append('\\'); str.append('\\'); } else if (c == '\''){ if (two_quotes_escape){ // quoted using two single quotes // See documents in anonymous.design str.append('\''); str.append('\''); } else{ quote_escape ? open_quoting() : close_quoting(); //bslash_escape always true here str.append('\\'); str.append('\''); } } else if (isSpecialAsciiChar(c) || isWhiteSpace(c)){ quote_escape ? open_quoting() : close_quoting(); if (bslash_escape) str.append('\\'); str.append(c); } else { //if (isAlphabet(c) || isDigit(c) || TRUE){ // treat others as literal if (escape_literal){ quote_escape ? open_quoting() : close_quoting(); if (bslash_escape) str.append('\\'); str.append(c); } else { close_quoting(); str.append(c); } } } public: // Return a null-terminate c-string. The buffer is owned by callee. char * operator()(const char * literal /*c-string*/){ str.reset(); for(;*literal != 0; literal++){ append(*literal); } close_quoting(); // P4 exception, to close whole quoting return str; } }; class WeightedRand{ // Return a random number in [0, size) // Every number has different chance (aka weight) to be selected. private: Buffer_int weights; double total; WeightedRand(const WeightedRand &); WeightedRand & operator = (const WeightedRand &); public: WeightedRand(Buffer_int * weight_list = NULL, int size = 0){ if ( weight_list == NULL){ for (int i=0; i<size; ++i) weights.append(DEFAULT_WEIGHT); } else { int s = weight_list->content_size(); if (s < size){ weights.append_array( (*weight_list),s); for (int i=s; i<size; ++i) weights.append(DEFAULT_WEIGHT); } else { // s >= size weights.append_array( (*weight_list),size); } } total = 0; int c = weights.content_size(); for (int i=0; i<c; ++i){ total += weights[i]; } } void append(int weight){ weights.append(weight); total += weight; } // Give a random number with the consideration of weight. // Every random number is associated with a weight. // It identifies the chance to be selected, // larger weight has more chance to be selected. // // // ______________________ every slot has equal chance // // [____][_][___][______] each item has different slots, hence different chance // // // The algorithms to generate the number is illustrated by preceding figure. // First, a slot is selected by rand(). Then we translate the slot to corresponding item. // int next(){ // get a random in [0,1] double reference_mark = (double)rand() / (double)RAND_MAX; // get the slot's index, 0 <= mark <= total; double mark = total * reference_mark; // translate the slot to corresponding item int i=0; for (;;){ mark -= weights[i]; // 0 <= mark <= total if (mark <= 0) break; i++; } return i; } }; /////////////////////////////////////////////////////////// // // The parser result nodes // class Literal : public Pick { public: virtual const char* next(){ return str; } Literal(const char * s /*c-string*/){ str.append_array(s, strlen(s) + 1); } private: Buffer_char str; //null-terminated c-string }; class Variable : public Pick { public: Variable(SymbolTable * symbols, const char * varName, Pick * varRef = NULL){ this->var_name.append_array(varName, strlen(varName) + 1); if ((symbol_table = symbols)){ symbol_table->put(varName, varRef); } } operator const char *(){ return var_name; } virtual const char* next(){ if (symbol_table){ Pick * var_ref = NULL; symbol_table->find(var_name, &var_ref); if (var_ref) { return var_ref->next(); } } return ""; // dumb string } private: Buffer_char var_name; SymbolTable * symbol_table; }; class Quote : public Pick{ public: Quote(Pick & base):item(base),e(Escaper::NO, Escaper::NO, Escaper::BSLASH_ONLY){ } virtual const char* next(){ return e(item.next()); } private: Pick & item; Buffer_char str; Escaper e; }; class Morph : public Pick{ /* The difference between morph and an arbitrary random string is that a morph changes slowly. When we build collation rules, for example, it is a much better test if the strings we use are all in the same 'neighborhood'; they share many common characters. */ public: Morph(Pick & base):item(base){} virtual const char* next(){ current.reset(); const char * s = item.next(); current.append_array(s, strlen(s) + 1); if (last.content_size() == 0) { str.reset(); last.reset(); str.append_array(current, current.content_size()); last.append_array(current, current.content_size()); } else { morph(); } return str; } private: Pick & item; Buffer_char str; Buffer_char last; Buffer_char current; char * p_last; char * p_curr; void copy_curr(){ if (*p_curr) { str.append(*p_curr); p_curr++; } } void copy_last(){ if (*p_last) { str.append(*p_last); p_last++; } } // copy 0, 1, or 2 character(s) to str void copy(){ static WeightedRand wr(& Buffer_int().append(DEFAULT_WEIGHT * 10), 5); switch (wr.next()){ case 0: // copy last -- has 10 times chance than others copy_last(); break; case 1: // copy both copy_curr(); copy_last(); break; case 2: // copy both copy_last(); copy_curr(); break; case 3: copy_curr(); break; case 4: // copy nothing break; default: // ASSERT(FALSE); ; } } void morph(void){ int min = strlen(last); int max = strlen(current); if (min > max){ int temp = min; min = max; max = temp; } int len = min + rand()%(max - min + 1); // min + [0, diff] p_curr = current; p_last = last; str.reset(); for (; str.content_size()<len && *p_curr && *p_last;){ copy(); // copy 0, 1, or 2 character(s) to str } if (str.content_size() == len) { str.append(0); final(); return; } if (str.content_size() > len) { // if the last copy copied two characters str[len]=0; final(); return; } // str.content_size() < len if (*p_last) { for (; str.content_size() < len; copy_last()); } else if (*p_curr){ for (; str.content_size() < len; copy_curr()); } int last_len = last.content_size(); for (;str.content_size() < len;){ str.append(last[rand()%last_len]); } str.append(0); final(); } void final(){ last.reset(); last.append_array(current, current.content_size()); } }; class Sequence : public Pick { public: virtual const char* next(){ str.reset(); int s = items.content_size(); for(int i=0; i < s; i++){ const char * t = items[i]->next(); str.append_array(t, strlen(t)); } str.append(0); // terminal null return str; } void append (Pick * node){ items.append(node); } virtual ~Sequence(){ int s = items.content_size(); for(int i=0; i < s; i++){ //How can assure the item is got from heap? //Let's assume it. delete items[i]; // TOFIX: point alias/recursion problem items[i] = NULL; } } private: Buffer_pPick items; Buffer_char str; //null-terminated c-string }; class Repeat : public Pick { private: Pick * item; Buffer_char str; WeightedRand wr; int min; int max; int select_a_count(){ return min + wr.next(); } public: virtual const char* next(){ str.reset(); int c = select_a_count(); for(int i=0; i< c; i++){ const char * t = item->next(); str.append_array(t, strlen(t)); } str.append(0); return str; } Repeat(Pick * base, int minCount =0, int maxCount = 1, Buffer_int * weights = NULL): wr(weights, maxCount-minCount +1) { this->item = base; this->min = minCount; this->max = maxCount; } virtual ~Repeat(){ delete item; // TOFIX: point alias/recursion problem item = NULL; } }; class Alternation : public Pick { public: virtual const char* next(){ str.reset(); int i = wr.next(); const char * t = items[i]->next(); str.append_array(t, strlen(t) + 1); return str; } virtual ~Alternation(){ int s = items.content_size(); for(int i=0; i < s; i++){ delete items[i]; // TOFIX: point alias/recursion problem items[i] = NULL; } } Alternation & append (Pick * node, int weight = DEFAULT_WEIGHT){ items.append(node); wr.append(weight); return *this; } private: Buffer_pPick items; Buffer_char str; // null-terminated c-string WeightedRand wr; }; /////////////////////////////////////////////////////////// // // The parser // enum TokenType {STRING, VAR, NUMBER, STREAM_END, ERROR, QUESTION, STAR, PLUS, LBRACE, RBRACE, LPAR, RPAR, SEMI, EQ, COMMA, BAR, AT, WAVE, PERCENT}; class Scanner{ friend int DumpScanner(Scanner & s, UBool dumb); private: const char * source; const char * working; const char * history; // for debug enum StateType {START, IN_NUM, IN_VAR_FIRST, IN_VAR, IN_QUOTE, IN_QUOTE_BSLASH, IN_BSLASH, IN_STRING, DONE}; StateType state; void terminated(TokenType t){ working--; // return the peeked character tokenType = t; token.append(0); // close buffer state = DONE; } public: // the buffer of "source" is owned by caller Scanner(const char *src/*[in] c-string*/ = NULL):source(src){ working = src; history = working; state = DONE; tokenType = ERROR; } //void setSource(const char *const src /*[in] c-string*/){ // *(&const_cast<const char *>(source)) = src; //} Buffer_char token; TokenType tokenType; TokenType getNextToken(){ token.reset(); state = START; history = working; // for debug while (state != DONE){ char c = *working++; if (c == 0 && state != START){//avoid buffer overflow. for IN_QUOE, IN_ESCAPE terminated(ERROR); break; // while } switch(state){ case START: tokenType = ERROR; switch(c){ case '?' : tokenType = QUESTION; break; case '*' : tokenType = STAR; break; case '+' : tokenType = PLUS; break; case '{' : tokenType = LBRACE; break; case '}' : tokenType = RBRACE; break; case '(' : tokenType = LPAR; break; case ')' : tokenType = RPAR; break; case ';' : tokenType = SEMI; break; case '=' : tokenType = EQ; break; case ',' : tokenType = COMMA; break; case '|' : tokenType = BAR; break; case '@' : tokenType = AT; break; case '~' : tokenType = WAVE; break; case '%' : tokenType = PERCENT; break; case 0 : tokenType = STREAM_END; working-- /*avoid buffer overflow*/; break; } if (tokenType != ERROR){ token.append(c); token.append(0); state = DONE; break; // START } switch(c){ case '$' : state = IN_VAR_FIRST; token.append(c); break; case '\'' : state = IN_QUOTE; break; case '\\' : state = IN_BSLASH; break; default: if (isWhiteSpace(c)){ // state = START; //do nothing } else if (isDigit(c)){ state = IN_NUM; token.append(c); } else if (isAlphabet(c)){ state = IN_STRING; token.append(c); } else {terminated(ERROR);} } break;//START case IN_NUM: if (isDigit(c)){ token.append(c); } else { terminated(NUMBER); } break;//IN_NUM case IN_VAR_FIRST: if (isAlphabet(c)){ token.append(c); state = IN_VAR; } else { terminated(ERROR); } break; // IN_VAR_FISRT case IN_VAR: if (isAlphabet(c) || isDigit(c)){ token.append(c); } else { terminated(VAR); } break;//IN_VAR case IN_STRING: // About the scanner's behavior for STRING, AT, and ESCAPE: // All of them can be contacted with each other. // This means the scanner will eat up as much as possible strings // (STRING, AT, and ESCAPE) at one time, with no regard of their // combining sequence. // if (c == '\''){ state = IN_QUOTE; // the first time we see single quote } else if (c =='\\'){ // back slash character state = IN_BSLASH; } else if (isAlphabet(c) || isDigit(c)){ token.append(c); } else{ terminated(STRING); } break;//IN_STRING case IN_QUOTE: if (c == '\''){ // the second time we see single quote state = IN_STRING; // see document in IN_STRING } else if ( c== '\\') { // backslah escape in quote state = IN_QUOTE_BSLASH; } else { token.append(c); // eat up everything, includes back slash } break;//IN_QUOTE case IN_QUOTE_BSLASH: case IN_BSLASH: switch (c){ case 'n' : token.append('\n'); break; case 'r' : token.append('\r'); break; case 't' : token.append('\t'); break; case '\'' : token.append('\''); break; case '\\' : token.append('\\'); break; default: token.append(c); // unknown escaping, treat it as literal } if (state == IN_BSLASH){ state = IN_STRING; // see document in IN_STRING } else { // state == IN_QUOTE_BSLASH state = IN_QUOTE; } break;//IN_BSLASH case DONE: /* should never happen */ default: working--; tokenType = ERROR; state = DONE; break; }//switch(state) }//while (state != DONE) return tokenType; } };//class Scanner class Parser{ friend UBool TestParser(); friend class TestParserT; friend class LanguageGenerator_impl; private: Scanner s; TokenType & token; int min_max; // for the evil infinite UBool match(TokenType expected){ if (token == expected) { token = s.getNextToken(); return TRUE; } else { //s.dumpCurrentPoint(); return FALSE; } } UBool weight(int & value){ if (token == NUMBER){ int temp = atoi(s.token); match(NUMBER); if (match(PERCENT)){ value = temp; return TRUE; } } return FALSE; } UBool repeat (Pick* &node /*in,out*/){ if (node == NULL) return FALSE; int count = -2; int min = -2; int max = -2; UBool question = FALSE; switch (token){ case QUESTION: match(QUESTION); min = 0; max = 1; count = 2; question = TRUE; break; case STAR: match(STAR); min = 0; max = -1; count = -1; break; case PLUS: match(PLUS); min = 1; max = -1; count = -1; break; case LBRACE: match(LBRACE); if (token != NUMBER){ return FALSE; }else { min = atoi(s.token); match(NUMBER); if (token == RBRACE){ match(RBRACE); max = min; count = 1; } else if (token == COMMA) { match(COMMA); if (token == RBRACE){ match(RBRACE); max = -1; count = -1; } else if (token == NUMBER) { max = atoi(s.token); match(NUMBER); count = max - min + 1; if (!match(RBRACE)) { return FALSE; } } else { return FALSE; } } else { return FALSE; } } break; default: return FALSE; } if (count == -2 || min == -2 || max == -2){ //ASSERT(FALSE); return FALSE; } // eat up following weights Buffer_int weights; int w; while (weight(w)){ weights.append(w); } // for the evil infinite min_max = min_max > min ? min_max : min; min_max = min_max > max ? min_max : max; if (min_max > PSEUDO_INFINIT){ return FALSE; // PSEUDO_INFINIT is less than the real maximum } if (max == -1){ // the evil infinite max = PSEUDO_INFINIT; } // for the strange question mark if (question && weights.content_size() > 0){ Buffer_int w2; w2.append(DEFAULT_WEIGHT - weights[0]).append(weights[0]); node = new Repeat(node,min,max,&w2); return TRUE; } node = new Repeat(node,min,max,&weights); return TRUE; } UBool core(Pick* &node /*out*/){ if (node != NULL) return FALSE; //assert node == NULL switch(token){ case LPAR: match(LPAR); if(defination(node) && match(RPAR)){ return TRUE; } return FALSE; case VAR: node = new Variable(&symbols, s.token); match(VAR); return TRUE; case STRING: node = new Literal(s.token); match(STRING); return TRUE; default: return FALSE; } } UBool modified(Pick* &node /*out*/){ if (node != NULL) return FALSE; //assert node == NULL if (!core(node)) { return FALSE; } for (;;){ switch(token){ case WAVE: match(WAVE); node = new Morph(*node); break; case AT: match(AT); node = new Quote(*node); break; case QUESTION: case STAR: case PLUS: case LBRACE: if (!repeat(node)) return FALSE; break; case SEMI: // rule definiation closed case RPAR: // within parenthesis (core closed) case BAR: // in alternation case NUMBER: // in alternation, with weight case LPAR: // in sequence case VAR: // in sequence case STRING: // in sequence return TRUE; default: return FALSE; } } } UBool sequence_list(Pick* &node /*in,out*/){ if (node == NULL) return FALSE; // assert node != NULL Sequence* seq = new Sequence(); Pick * n = node; while (token == VAR || token == STRING || token == LPAR){ seq->append(n); n = NULL; if (modified(n)){ // go on } else { goto FAIL; } } if (token == SEMI || token == RPAR || token == BAR){ seq->append(n); node = seq; return TRUE; } FAIL: delete seq; return FALSE; } UBool sequence(Pick* &node /*out*/){ if (node != NULL) return FALSE; //assert node == NULL if (!modified(node)) { return FALSE; } if (token == VAR || token == STRING || token == LPAR){ return sequence_list(node); } else { return TRUE; // just a modified } } UBool alternation_list(Pick* &node /*in,out*/){ if (node == NULL) return FALSE; // assert node != NULL Alternation * alt = new Alternation(); Pick * n = node; int w = DEFAULT_WEIGHT; while (token == NUMBER || token == BAR){ if(token == NUMBER) { if (weight(w)){ if (token == BAR){ // the middle item, go on } else { // the last item or encounter error break; //while } } else { goto FAIL; } } // else token == BAR match(BAR); alt->append(n,w); n = NULL; w = DEFAULT_WEIGHT; if (sequence(n)){ // go on } else { goto FAIL; } } if (token == SEMI || token == RPAR) { alt->append(n,w); node = alt; return TRUE; } FAIL: delete alt; return FALSE; } UBool alternation(Pick* &node /*out*/){ if (node != NULL) return FALSE; //assert node == NULL // 'sequence' has higher precedence than 'alternation' if (!sequence(node)){ return FALSE; } if (token == BAR || token == NUMBER){ // find a real alternation1, create it. return alternation_list(node); } else { return TRUE; // just a sequence_old } } UBool defination(Pick* &node /*out*/){ if (node != NULL) return FALSE; //assert node == NULL return alternation(node); } UBool rule(){ if (token == VAR){ Buffer_char name; name.append_array(s.token, strlen(s.token) + 1); match(VAR); if (match(EQ)){ Pick * t = NULL; if(defination(t)){ symbols.put(name, t); return match(SEMI); } } } return FALSE; } public: UBool rules(){ symbols.reset(); token = s.getNextToken(); while (rule()){ } if (token == STREAM_END){ return TRUE; } else { //s.dumpCurrentPoint(); return FALSE; } } public: SymbolTable symbols; Parser(const char *const source):s(source), token(s.tokenType){ min_max = -2; } UBool parse(){ return rules(); } }; // class Parser /////////////////////////////////////////////////////////// // // // int DumpScanner(Scanner & s, UBool dump = TRUE){ int len = strlen(s.source); int error_start_offset = s.history - s.source; if (dump){ printf("\n=================== DumpScanner ================\n"); fwrite(s.source, len, 1, stdout); printf("\n-----parsed-------------------------------------\n"); fwrite(s.source, s.history - s.source, 1, stdout); printf("\n-----current------------------------------------\n"); fwrite(s.history, s.working - s.history, 1, stdout); printf("\n-----unparsed-----------------------------------\n"); fwrite(s.working, (s.source + len - s.working), 1, stdout); printf("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n"); } return error_start_offset; } class LanguageGenerator_impl{ public: LanguageGenerator_impl(const char *const bnf_definition, const char *const top_node) :par(bnf_definition), top_node_name(top_node){ srand((unsigned)time( NULL )); } LanguageGenerator::PARSE_RESULT parseBNF(UBool debug = TRUE){ if (par.parse()){ if (par.symbols.find(top_node_name, &top_node_ref) == SymbolTable::HAS_REF) { if (par.symbols.is_complete()) { return LanguageGenerator::OK; } else { if (debug) printf("The bnf definition is incomplete.\n"); return LanguageGenerator::INCOMPLETE; } } else { if (debug) printf("No top node is found.\n"); return LanguageGenerator::NO_TOP_NODE; } } else { if(debug) { printf("The bnf definition is wrong\n"); DumpScanner(par.s, TRUE); } return LanguageGenerator::BNF_DEF_WRONG; } } const char * next(){ return top_node_ref->next(); } private: Parser par; const char *const top_node_name; Pick * top_node_ref; }; LanguageGenerator::LanguageGenerator():lang_gen(NULL){ } LanguageGenerator::~LanguageGenerator(){ delete lang_gen; } LanguageGenerator::PARSE_RESULT LanguageGenerator::parseBNF(const char *const bnf_definition /*in*/, const char *const top_node/*in*/, UBool debug){ if (lang_gen){ delete lang_gen; } lang_gen = new LanguageGenerator_impl(bnf_definition, top_node); PARSE_RESULT r = lang_gen->parseBNF(debug); if (r != OK){ delete lang_gen; lang_gen = NULL; return r; } else { return r; } } const char *LanguageGenerator::next(){ // Return a null-terminated c-string. The buffer is owned by callee. if (lang_gen){ return lang_gen->next(); }else { return ""; } } /////////////////////////////////////////////////////////// // // The test code for WBNF // #define CALL(fun) \ if (fun()){ \ printf("Pass: " #fun "\n");\ } else { \ printf("FAILED: !!! " #fun " !!!\n"); \ } #define DUMP_R(fun, var, times) \ {printf("\n========= " #fun " =============\n"); \ for (int i=0; i<times; i++) { \ const char * t = var.next();\ fwrite(t,strlen(t),1,stdout); \ printf("\n"); \ } \ printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n");} #if TEST_WBNF_TEST static UBool TestQuote(){ const char *const str = "This ' A !,z| qq [] .new\tline"; //const char *const str_r = "This \\' A '!,'z'|' qq '[]' '.'new\tline"; //// //// :( we must quote our string to following C syntax //// cannot type the literal here, it makes our code rather human unreadable //// very very unconformable! //// ///* //*/ //const char *const s1 = "ab'c"; //const char (* s1_r1) [] = { "ab''c", // ab''c // "ab\\'c", // ab\'c // };// ///* // . '.' \. // .. \.\. '.'\. '.'\. '..' // '.''.' wrong //*/ //const char *const s2 = "a..'.b"; // a..'.b //const char (*s2_r) [] = { "a'..''.'b" // a'..''.'b // ,"a'..\\'.'b" // a'..\'.'b // ,"a'..'\\''.'b" // a'..'\''.'b // };// //const char *const s3 = "a..\\.b"; // a..\.b //const char (*s3_r) [] = { "a'..\\\\.'b" // a'..\\.'b // ,"a'..'\\\\'.'b" // a'..'\\'.'b // };// // // no catact operation, no choice, must be compact srand((unsigned)time( NULL )); //Escaper l(Escaper::NO, Escaper::NO, Escaper::RAND_ESC); Pick *p = new Literal(str); Quote q(*p); DUMP_R(TestQuote, (*p), 1); DUMP_R(TestQuote, q, 20); return FALSE; } static UBool TestLiteral(){ const char * s = "test string99."; Literal n(s); const char * r = n.next(); return strcmp(s,r) == 0; } static UBool TestSequence(){ Sequence seq; seq.append(new Literal("abc ")); seq.append(new Literal(", s")); return strcmp(seq.next(), "abc , s") == 0; } static UBool TestAlternation(){ srand((unsigned)time( NULL )); Alternation alt; alt.append(new Literal("aaa_10%"),10); alt.append(new Literal("bbb_0%"),0); alt.append(new Literal("ccc_10%"),10); alt.append(new Literal("ddddddd_50%"),50); DUMP_R(TestAlternation, alt, 50); return FALSE; } static UBool TestBuffer(){ Buffer_int t; t.append(1).append(0).append(5); int s = t.content_size(); for (int i=0; i<s; ++i){ printf("%d\n", t[i]); } return FALSE; } static UBool TestWeightedRand(){ srand((unsigned)time( NULL )); Buffer_int t; t.append(1).append(0).append(5); WeightedRand wr(&Buffer_int().append(10).append(0).append(50),4); // WeightedRand wr(&t,3); for (int i=0; i< 50; ++i){ printf("%d\n", wr.next()); } return FALSE; } static UBool TestRepeat(){ srand((unsigned)time( NULL )); Repeat rep(new Literal("aaa1-5 "), 1, 5); DUMP_R(TestRepeat, rep, 50); Repeat r2(new Literal("b{1,3}1%0%5% "), 1, 3, &Buffer_int().append(1).append(0).append(5)); DUMP_R(TestRepeat, r2, 50); Repeat r3(new Literal("aaa5-5 "), 5, 5); DUMP_R(TestRepeat, r3, 50); return FALSE; } static UBool TestVariable(){ SymbolTable tab; Pick * value = new Literal("string1"); Variable var1(&tab, "x", value); Variable var2(&tab, "y"); // tab.put(var2, value); // TOFIX: point alias/recursion problem Pick * value2 = new Literal("string2"); tab.put(var2, value2); Pick * value3 = new Literal("string3"); Variable var3(&tab, "z"); tab.put("z", value3); UBool pass; pass = strcmp(var1.next(), value->next()) == 0; pass = pass && strcmp(var2.next(), value2->next()) == 0; pass = pass && strcmp(var3.next(), value3->next()) == 0; return pass; } static UBool TestSymbolTable(){ Literal * n1 = new Literal("string1"); Literal * n2 = new Literal("string2"); SymbolTable t; t.put("abc", n1); t.put("$aaa", n2); // t.put("alias", n1); // TOFIX: point alias/recursion problem t.put("bbb"); UBool pass; pass = t.find(NULL) == SymbolTable::EMPTY; pass = pass && t.find("ccc") == SymbolTable::NO_VAR; pass = pass && t.find("bbb") == SymbolTable::NO_REF; pass = pass && t.find("abc") == SymbolTable::HAS_REF; pass = pass && t.find("$aaa") == SymbolTable::HAS_REF; t.reset(); pass = pass && t.find("abc") == SymbolTable::NO_VAR; return pass; } static UBool TestScanner(void){ //const char str1[] = "$root = $command{0,5} $reset $mostRules{1,20};"; //const char str1_r[][20] = {"$root", "=", "$command", "{", "0", ",", "5", "}", // "$reset", "$mostRules", "{", "1", ",", "20", "}", ";"}; const char str2[] = "$p2 =(\\\\ $s $string $s)? 25%;"; const char str2_r[][20] = {"$p2", "=", "(", "\\", "$s", "$string", "$s", ")", "?", "25", "%", ";"}; const char *str = str2; const char (*str_r)[20] = str2_r; int tokenNum = sizeof(str2_r)/sizeof(char[20]); Scanner t(str); UBool pass = TRUE; t.getNextToken(); int i = 0; while (pass){ if (t.tokenType == STREAM_END){ pass = pass? i == tokenNum : FALSE; break;//while } else if (t.tokenType == ERROR){ pass = FALSE; break;//while } else { pass = strcmp( &(t.token[0]), str_r[i++]) == 0; t.getNextToken(); } } //const char ts[] = "$commandList = '['" //" ( alternate ' ' $alternateOptions" //" | backwards ' 2'" //" | normalization ' ' $onoff " //" | caseLevel ' ' $onoff " //" | hiraganaQ ' ' $onoff" //" | caseFirst ' ' $caseFirstOptions" //" | strength ' ' $strengthOptions" //" ) ']';" ; //Scanner t2(ts); //pass = TRUE; //do { // t2.getNextToken(); // if (t2.tokenType == ERROR){ // DumpScanner(t2); // return FALSE; // } //}while (t.tokenType != STREAM_END); return pass; } class TestParserT { public: UBool operator () (const char *const str, const int exp_error_offset = -1, const UBool dump = TRUE){ Parser par(str); if (par.rules()){ if ( exp_error_offset == -1){ return TRUE; }else { DumpScanner(par.s,dump); return FALSE; } }else { return DumpScanner(par.s, dump) == exp_error_offset; } } }; UBool TestParser(){ TestParserT test; UBool pass = TRUE; pass = pass && test ("$s = ' ' ? 50%;"); pass = pass && test("$x = ($var {1,2}) 3%;"); // legal pass = pass && test("$x = $var {1,2} 3% | b 4%;"); // legal pass = pass && test("$x = $var {1,2} 3%;"); // legal pass = pass && test("$m = $c ? 2% 4% | $r 5% | $n 25%;"); // legal pass = pass && test("$a = b ? 2% | c 5%;"); // legal pass = pass && test("$x = A B 5% C 10% | D;", 8, FALSE); // illegal 5% pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc pass = pass && test("$x = (b 5%) (c 6%);"); // legal pass = pass && test("$x = (b 5%) c 6%;", 13, FALSE); // illegal 6% pass = pass && test("$x = b 5% (c 6%);", 9, FALSE); // illegal (c 6%) pass = pass && test("$x = b 5% c 6%;", 9, FALSE); // illegal c 6% pass = pass && test("$x = b 5%;"); // legal pass = pass && test("$x = aa 45% | bb 5% cc;", 19, FALSE);// illegal cc pass = pass && test("$x = a | b | c 4% | d 5%;"); // legal pass = pass && test("$s = ' ' ? 50% abc;"); // legal pass = pass && test("$s = a | c d | e f;"); // legal pass = pass && test( "$z = q 0% | p 1% | r 100%;"); // legal How to check parsed tree?? pass = pass && test("$s = ' ' ? 50%;"); pass = pass && test("$relationList = '<' | '<<' | ';' | '<<<' | ',' | '=';"); pass = pass && test("$p1 = ($string $s '|' $s)? 25%;"); pass = pass && test("$p2 = (\\\\ $s $string $s)? 25%;"); pass = pass && test("$rel2 = $p1 $string $s $p2;"); pass = pass && test("$relation = $relationList $s ($rel1 | $rel2) $crlf;"); pass = pass && test("$command = $commandList $crlf;"); pass = pass && test("$reset = '&' $s ($beforeList $s)? 10% ($positionList 100% | $string 10%) $crlf;"); pass = pass && test("$mostRules = $command 1% | $reset 5% | $relation 25%;"); pass = pass && test("$root = $command{0,5} $reset $mostRules{1,20};"); const char collationBNF[] = "$s = ' '? 50%;" "$crlf = '\r\n';" "$alternateOptions = non'-'ignorable | shifted;" "$onoff = on | off;" "$caseFirstOptions = off | upper | lower;" "$strengthOptions = '1' | '2' | '3' | '4' | 'I';" "$commandList = '['" " ( alternate ' ' $alternateOptions" " | backwards ' 2'" " | normalization ' ' $onoff " " | caseLevel ' ' $onoff " " | hiraganaQ ' ' $onoff" " | caseFirst ' ' $caseFirstOptions" " | strength ' ' $strengthOptions" " ) ']';" "$command = $commandList $crlf;" "$ignorableTypes = (tertiary | secondary | primary) ' ' ignorable;" "$allTypes = variable | regular | implicit | trailing | $ignorableTypes;" "$positionList = '[' (first | last) ' ' $allTypes ']';" "$beforeList = '[before ' ('1' | '2' | '3') ']';" "$relationList = (" " '<'" " | '<<'" " | ';'" " | '<<<'" " | ','" " | '='" ");" "$string = $magic;" "$rel1 = '[variable top]' $s;" "$p1 = ($string $s '|' $s)? 25%;" "$p2 = (\\\\ $s $string $s)? 25%;" "$rel2 = $p1 $string $s $p2;" "$relation = $relationList $s ($rel1 | $rel2) $crlf;" "$reset = '&' $s ($beforeList $s)? 10% ($positionList 1% | $string 10%) $crlf;" "$mostRules = $command 1% | $reset 5% | $relation 25%;" "$root = $command{0,5} $reset $mostRules{1,20};" ; pass = pass && test(collationBNF); return pass; } static UBool TestMorph(){ srand((unsigned)time( NULL )); Alternation * alt = new Alternation(); (*alt) .append(new Literal("a")).append(new Literal("b")).append(new Literal("c")) .append(new Literal("d")).append(new Literal("e")).append(new Literal("f")) .append(new Literal("g")).append(new Literal("h")).append(new Literal("i")) .append(new Literal("j")).append(new Literal("k")).append(new Literal("l")) .append(new Literal("m")).append(new Literal("n")).append(new Literal("o")) ; Repeat * rep = new Repeat( alt ,5,5 ); Morph m( *rep); // DUMP_R(TestMorph,(*rep),20); DUMP_R(TestMorph,m,100); return FALSE; } #endif static UBool TestLanguageGenerator(){ //LanguageGenerator g; //const char *const s = "$s = p 0% | q 1%;"; //g.parseBNF(s, "$s"); UBool pass; //= strcmp("q", g.next()) == 0; const char *const def = //"$a = $b;" //"$b = $c;" //"$c = $t;" //"$t = abc $z{1,2};" //"$k = a | b | c | d | e | f | g ;" //"$z = q 0% | p 1% | r 1%;" "$x = a ? 0%;" ; // end of string // const char * s = "abczz"; // // LanguageGenerator g; pass = g.parseBNF(def, "$x",TRUE); //// LanguageGenerator g(collationBNF, "$root", "$magic", new MagicNode()); // if (pass != LanguageGenerator::OK) return FALSE; DUMP_R(TestLanguageGenerator, g, 20); return pass; ////UBool pass = strcmp(s,r) == 0; //if (pass){ // printf("TestRandomLanguageGenerator passed.\n"); //} else { // printf("TestRandomLanguageGenerator FAILED!!!\n"); //} //return pass; } void TestWbnf(void){ srand((unsigned)time( NULL )); //CALL(TestLiteral); //CALL(TestSequence); //CALL(TestSymbolTable); //CALL(TestVariable); //TestRepeat(); //TestAlternation(); //TestMorph(); //TestQuote(); //TestBuffer(); //TestWeightedRand(); //CALL(TestScanner); //CALL(TestParser); CALL(TestLanguageGenerator); }