/** \file * Defines the basic structure to support recognizing by either a lexer, * parser, or tree parser. * \addtogroup BaseRecognizer * @{ */ #ifndef _ANTLR3_BASERECOGNIZER_HPP #define _ANTLR3_BASERECOGNIZER_HPP // [The "BSD licence"] // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. The name of the author may not be used to endorse or promote products // derived from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "antlr3defs.hpp" #include "antlr3collections.hpp" ANTLR_BEGIN_NAMESPACE() /** \brief Base tracking context structure for all types of * recognizers. */ template< class ImplTraits, class StreamType > class BaseRecognizer : public ImplTraits::AllocPolicyType { public: typedef typename ImplTraits::AllocPolicyType AllocPolicyType; typedef typename StreamType::IntStreamType IntStreamType; typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType SuperType; typedef typename StreamType::UnitType UnitType; typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType; typedef typename ImplTraits::BitsetType BitsetType; typedef typename ImplTraits::BitsetListType BitsetListType; typedef typename ImplTraits::StringType StringType; typedef typename ImplTraits::template RecognizerSharedStateType<StreamType> RecognizerSharedStateType; typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType; typedef typename ImplTraits::LexerType LexerType; typedef typename ImplTraits::ParserType ParserType; typedef typename ImplTraits::TreeParserType TreeParserType; typedef typename AllocPolicyType::template StackType<StringType> StringStackType; typedef typename AllocPolicyType::template ListType<StringType> StringListType; private: /// A pointer to the shared recognizer state, such that multiple /// recognizers can use the same inputs streams and so on (in /// the case of grammar inheritance for instance. /// RecognizerSharedStateType* m_state; /// If set to something other than NULL, then this structure is /// points to an instance of the debugger interface. In general, the /// debugger is only referenced internally in recovery/error operations /// so that it does not cause overhead by having to check this pointer /// in every function/method /// DebugEventListenerType* m_debugger; public: BaseRecognizer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state); SuperType* get_super(); RecognizerSharedStateType* get_state() const; DebugEventListenerType* get_debugger() const; void set_state( RecognizerSharedStateType* state ); void set_debugger( DebugEventListenerType* debugger ); /// Match current input symbol against ttype. Upon error, do one token /// insertion or deletion if possible. /// To turn off single token insertion or deletion error /// recovery, override mismatchRecover() and have it call /// plain mismatch(), which does not recover. Then any error /// in a rule will cause an exception and immediate exit from /// rule. Rule would recover by resynchronizing to the set of /// symbols that can follow rule ref. /// const UnitType* match(ANTLR_UINT32 ttype, BitsetListType* follow); /// Consumes the next token, whatever it is, and resets the recognizer state /// so that it is not in error. /// /// \param recognizer /// Recognizer context pointer /// void matchAny(); /// function that decides if the token ahead of the current one is the /// one we were loking for, in which case the curernt one is very likely extraneous /// and can be reported that way. /// bool mismatchIsUnwantedToken(IntStreamType* input, ANTLR_UINT32 ttype); /// function that decides if the current token is one that can logically /// follow the one we were looking for, in which case the one we were looking for is /// probably missing from the input. /// bool mismatchIsMissingToken(IntStreamType* input, BitsetListType* follow); /// Factor out what to do upon token mismatch so tree parsers can behave /// differently. Override and call mismatchRecover(input, ttype, follow) /// to get single token insertion and deletion. Use this to turn off /// single token insertion and deletion. Override mismatchRecover /// to call this instead. /// /// \remark mismatch only works for parsers and must be overridden for anything else. /// void mismatch(ANTLR_UINT32 ttype, BitsetListType* follow); /// Report a recognition problem. /// /// This method sets errorRecovery to indicate the parser is recovering /// not parsing. Once in recovery mode, no errors are generated. /// To get out of recovery mode, the parser must successfully match /// a token (after a resync). So it will go: /// /// 1. error occurs /// 2. enter recovery mode, report error /// 3. consume until token found in resynch set /// 4. try to resume parsing /// 5. next match() will reset errorRecovery mode /// /// If you override, make sure to update errorCount if you care about that. /// void reportError(); void reportError( ClassForwarder<LexerType> ); template<typename CompType> void reportError( ClassForwarder<CompType> ); /** Function that is called to display a recognition error message. You may * override this function independently of (*reportError)() above as that function calls * this one to do the actual exception printing. */ void displayRecognitionError(ANTLR_UINT8** tokenNames); /// Get number of recognition errors (lexer, parser, tree parser). Each /// recognizer tracks its own number. So parser and lexer each have /// separate count. Does not count the spurious errors found between /// an error and next valid token match /// /// \see reportError() /// ANTLR_UINT32 getNumberOfSyntaxErrors(); /** Function that recovers from an error found in the input stream. * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also * be from a mismatched token that the (*match)() could not recover from. */ void recover(); /** function that is a hook to listen to token consumption during error recovery. * This is mainly used by the debug parser to send events to the listener. */ void beginResync(); /** function that is a hook to listen to token consumption during error recovery. * This is mainly used by the debug parser to send events to the listener. */ void endResync(); /** function that is a hook to listen to token consumption during error recovery. * This is mainly used by the debug parser to send events to the listener. */ void beginBacktrack(ANTLR_UINT32 level); /** function that is a hook to listen to token consumption during error recovery. * This is mainly used by the debug parser to send events to the listener. */ void endBacktrack(ANTLR_UINT32 level, bool successful); /// Compute the error recovery set for the current rule. /// Documentation below is from the Java implementation. /// /// During rule invocation, the parser pushes the set of tokens that can /// follow that rule reference on the stack; this amounts to /// computing FIRST of what follows the rule reference in the /// enclosing rule. This local follow set only includes tokens /// from within the rule; i.e., the FIRST computation done by /// ANTLR stops at the end of a rule. // /// EXAMPLE // /// When you find a "no viable alt exception", the input is not /// consistent with any of the alternatives for rule r. The best /// thing to do is to consume tokens until you see something that /// can legally follow a call to r *or* any rule that called r. /// You don't want the exact set of viable next tokens because the /// input might just be missing a token--you might consume the /// rest of the input looking for one of the missing tokens. /// /// Consider grammar: /// /// a : '[' b ']' /// | '(' b ')' /// ; /// b : c '^' INT ; /// c : ID /// | INT /// ; /// /// At each rule invocation, the set of tokens that could follow /// that rule is pushed on a stack. Here are the various "local" /// follow sets: /// /// FOLLOW(b1_in_a) = FIRST(']') = ']' /// FOLLOW(b2_in_a) = FIRST(')') = ')' /// FOLLOW(c_in_b) = FIRST('^') = '^' /// /// Upon erroneous input "[]", the call chain is /// /// a -> b -> c /// /// and, hence, the follow context stack is: /// /// depth local follow set after call to rule /// 0 <EOF> a (from main()) /// 1 ']' b /// 3 '^' c /// /// Notice that ')' is not included, because b would have to have /// been called from a different context in rule a for ')' to be /// included. /// /// For error recovery, we cannot consider FOLLOW(c) /// (context-sensitive or otherwise). We need the combined set of /// all context-sensitive FOLLOW sets--the set of all tokens that /// could follow any reference in the call chain. We need to /// resync to one of those tokens. Note that FOLLOW(c)='^' and if /// we resync'd to that token, we'd consume until EOF. We need to /// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. /// In this case, for input "[]", LA(1) is in this set so we would /// not consume anything and after printing an error rule c would /// return normally. It would not find the required '^' though. /// At this point, it gets a mismatched token error and throws an /// exception (since LA(1) is not in the viable following token /// set). The rule exception handler tries to recover, but finds /// the same recovery set and doesn't consume anything. Rule b /// exits normally returning to rule a. Now it finds the ']' (and /// with the successful match exits errorRecovery mode). /// /// So, you can see that the parser walks up call chain looking /// for the token that was a member of the recovery set. /// /// Errors are not generated in errorRecovery mode. /// /// ANTLR's error recovery mechanism is based upon original ideas: /// /// "Algorithms + Data Structures = Programs" by Niklaus Wirth /// /// and /// /// "A note on error recovery in recursive descent parsers": /// http://portal.acm.org/citation.cfm?id=947902.947905 /// /// Later, Josef Grosch had some good ideas: /// /// "Efficient and Comfortable Error Recovery in Recursive Descent /// Parsers": /// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip /// /// Like Grosch I implemented local FOLLOW sets that are combined /// at run-time upon error to avoid overhead during parsing. /// BitsetType* computeErrorRecoverySet(); /// Compute the context-sensitive FOLLOW set for current rule. /// Documentation below is from the Java runtime. /// /// This is the set of token types that can follow a specific rule /// reference given a specific call chain. You get the set of /// viable tokens that can possibly come next (look ahead depth 1) /// given the current call chain. Contrast this with the /// definition of plain FOLLOW for rule r: /// /// FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} /// /// where x in T* and alpha, beta in V*; T is set of terminals and /// V is the set of terminals and non terminals. In other words, /// FOLLOW(r) is the set of all tokens that can possibly follow /// references to r in///any* sentential form (context). At /// runtime, however, we know precisely which context applies as /// we have the call chain. We may compute the exact (rather /// than covering superset) set of following tokens. /// /// For example, consider grammar: /// /// stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} /// | "return" expr '.' /// ; /// expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} /// atom : INT // FOLLOW(atom)=={'+',')',';','.'} /// | '(' expr ')' /// ; /// /// The FOLLOW sets are all inclusive whereas context-sensitive /// FOLLOW sets are precisely what could follow a rule reference. /// For input input "i=(3);", here is the derivation: /// /// stat => ID '=' expr ';' /// => ID '=' atom ('+' atom)* ';' /// => ID '=' '(' expr ')' ('+' atom)* ';' /// => ID '=' '(' atom ')' ('+' atom)* ';' /// => ID '=' '(' INT ')' ('+' atom)* ';' /// => ID '=' '(' INT ')' ';' /// /// At the "3" token, you'd have a call chain of /// /// stat -> expr -> atom -> expr -> atom /// /// What can follow that specific nested ref to atom? Exactly ')' /// as you can see by looking at the derivation of this specific /// input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. /// /// You want the exact viable token set when recovering from a /// token mismatch. Upon token mismatch, if LA(1) is member of /// the viable next token set, then you know there is most likely /// a missing token in the input stream. "Insert" one by just not /// throwing an exception. /// BitsetType* computeCSRuleFollow(); /// Compute the current followset for the input stream. /// BitsetType* combineFollows(bool exact); /// Attempt to recover from a single missing or extra token. /// /// EXTRA TOKEN /// /// LA(1) is not what we are looking for. If LA(2) has the right token, /// however, then assume LA(1) is some extra spurious token. Delete it /// and LA(2) as if we were doing a normal match(), which advances the /// input. /// /// MISSING TOKEN /// /// If current token is consistent with what could come after /// ttype then it is ok to "insert" the missing token, else throw /// exception For example, Input "i=(3;" is clearly missing the /// ')'. When the parser returns from the nested call to expr, it /// will have call chain: /// /// stat -> expr -> atom /// /// and it will be trying to match the ')' at this point in the /// derivation: /// /// => ID '=' '(' INT ')' ('+' atom)* ';' /// ^ /// match() will see that ';' doesn't match ')' and report a /// mismatched token error. To recover, it sees that LA(1)==';' /// is in the set of tokens that can follow the ')' token /// reference in rule atom. It can assume that you forgot the ')'. /// /// The exception that was passed in, in the java implementation is /// sorted in the recognizer exception stack in the C version. To 'throw' it we set the /// error flag and rules cascade back when this is set. /// const UnitType* recoverFromMismatchedToken( ANTLR_UINT32 ttype, BitsetListType* follow); /** Function that recovers from a mismatched set in the token stream, in a similar manner * to (*recoverFromMismatchedToken) */ const UnitType* recoverFromMismatchedSet(BitsetListType* follow); /** common routine to handle single token insertion for recovery functions. */ /// This code is factored out from mismatched token and mismatched set /// recovery. It handles "single token insertion" error recovery for /// both. No tokens are consumed to recover from insertions. Return /// true if recovery was possible else return false. /// bool recoverFromMismatchedElement(BitsetListType* follow); /** function that consumes input until the next token matches * the given token. */ void consumeUntil(ANTLR_UINT32 tokenType); /** function that consumes input until the next token matches * one in the given set. */ void consumeUntilSet(BitsetType* set); /** function that returns an ANTLR3_LIST of the strings that identify * the rules in the parser that got you to this point. Can be overridden by installing your * own function set. * * \todo Document how to override invocation stack functions. */ StringStackType getRuleInvocationStack(); StringStackType getRuleInvocationStackNamed(ANTLR_UINT8* name); /** function that converts an ANLR3_LIST of tokens to an ANTLR3_LIST of * string token names. As this is mostly used in string template processing it may not be useful * in the C runtime. */ StringListType toStrings( const StringListType& ); /** function to return whether the rule has parsed input starting at the supplied * start index before. If the rule has not parsed input starting from the supplied start index, * then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point * then it will return the point where it last stopped parsing after that start point. */ ANTLR_MARKER getRuleMemoization( ANTLR_INTKEY ruleIndex, ANTLR_MARKER ruleParseStart); /** function that determines whether the rule has parsed input at the current index * in the input stream */ bool alreadyParsedRule(ANTLR_MARKER ruleIndex); /** Function that records whether the rule has parsed the input at a * current position successfully or not. */ void memoize(ANTLR_MARKER ruleIndex, ANTLR_MARKER ruleParseStart); /// Function that returns the current input symbol. /// The is placed into any label for the associated token ref; e.g., x=ID. Token /// and tree parsers need to return different objects. Rather than test /// for input stream type or change the IntStream interface, I use /// a simple method to ask the recognizer to tell me what the current /// input symbol is. /// /// This is ignored for lexers and the lexer implementation of this /// function should return NULL. /// const UnitType* getCurrentInputSymbol(IntStreamType* istream); const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<LexerType>); const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<ParserType>); const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<TreeParserType>); /// Conjure up a missing token during error recovery. /// /// The recognizer attempts to recover from single missing /// symbols. But, actions might refer to that missing symbol. /// For example, x=ID {f($x);}. The action clearly assumes /// that there has been an identifier matched previously and that /// $x points at that token. If that token is missing, but /// the next token in the stream is what we want we assume that /// this token is missing and we keep going. Because we /// have to return some token to replace the missing token, /// we have to conjure one up. This method gives the user control /// over the tokens returned for missing tokens. Mostly, /// you will want to create something special for identifier /// tokens. For literals such as '{' and ',', the default /// action in the parser or tree parser works. It simply creates /// a CommonToken of the appropriate type. The text will be the token. /// If you change what tokens must be created by the lexer, /// override this method to create the appropriate tokens. /// UnitType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e, ANTLR_UINT32 expectedTokenType, BitsetListType* follow); /** Function that returns whether the supplied grammar function * will parse the current input stream or not. This is the way that syntactic * predicates are evaluated. Unlike java, C is perfectly happy to invoke code * via a pointer to a function (hence that's what all the ANTLR3 C interfaces * do. */ template<typename Predicate> bool synpred( ClassForwarder<Predicate> ); //In place of exConstruct, just directly instantiate the Exception Object /** Reset the recognizer */ void reset(); void reset( ClassForwarder<LexerType> ); template<typename CompType> void reset( ClassForwarder<CompType> ); void exConstruct(); ~BaseRecognizer(); }; ANTLR_END_NAMESPACE() #include "antlr3baserecognizer.inl" /// @} /// #endif /* _ANTLR3_BASERECOGNIZER_H */