/** \file
 * Defines the class interface for an antlr3 INTSTREAM.
 *
 * Certain functionality (such as DFAs for instance) abstract the stream of tokens
 * or characters into a stream of integers. Hence this structure should be included
 * in any stream that is able to provide the output as a stream of integers (which is
 * basically anything).
 *
 * There are no specific implementations of the methods in this interface in general. Though
 * for purposes of casting and so on, it may be necessary to implement a function with
 * the signature in this interface which abstracts the base implementation. In essence though
 * the base stream provides a pointer to this interface, within which it installs its
 * normal match() functions and so on. Interfaces such as DFA are then passed the pANTLR3_INT_STREAM
 * and can treat any input as an int stream.
 *
 * For instance, a lexer implements a pANTLR3_BASE_RECOGNIZER, within which there is a pANTLR3_INT_STREAM.
 * However, a pANTLR3_INPUT_STREAM also provides a pANTLR3_INT_STREAM, which it has constructed from
 * its normal interface when it was created. This is then pointed at by the pANTLR3_BASE_RECOGNIZER
 * when it is initialized with a pANTLR3_INPUT_STREAM.
 *
 * Similarly if a pANTLR3_BASE_RECOGNIZER is initialized with a pANTLR3_TOKEN_STREAM, then the
 * pANTLR3_INT_STREAM is taken from the pANTLR3_TOKEN_STREAM.
 *
 * If a pANTLR3_BASE_RECOGNIZER is initialized with a pANTLR3_TREENODE_STREAM, then guess where
 * the pANTLR3_INT_STREAM comes from?
 *
 * Note that because the context pointer points to the actual interface structure that is providing
 * the ANTLR3_INT_STREAM it is defined as a (void *) in this interface. There is no direct implementation
 * of an ANTLR3_INT_STREAM (unless someone did not understand what I was doing here =;?P)
 */
#ifndef _ANTLR3_INTSTREAM_HPP
#define _ANTLR3_INTSTREAM_HPP

// [The "BSD licence"]
// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
//
// All rights reserved.
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. The name of the author may not be used to endorse or promote products // derived from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include <cassert>
#include "antlr3defs.hpp"

ANTLR_BEGIN_NAMESPACE()

/** Bit values identifying the kind of input stream. The individual values may
 *  be OR'ed into a custom stream's type indicator (see the remarks below).
 */
enum STREAM_TYPE
{
    /** Type indicator for a character stream
     * \remark if a custom stream is created but it can be treated as
     * a char stream, then you may OR in this value to your type indicator
     */
    CHARSTREAM = 0x0001

    /** Type indicator for a Token stream
     * \remark if a custom stream is created but it can be treated as
     * a token stream, then you may OR in this value to your type indicator
     */
    , TOKENSTREAM = 0x0002

    /** Type indicator for a common tree node stream
     * \remark if a custom stream is created but it can be treated as
     * a common tree node stream, then you may OR in this value to your type indicator
     */
    , COMMONTREENODE = 0x0004

    /** Type mask for input stream so we can switch in the above types
     * \remark DO NOT USE 0x0000 as a stream type!
     */
    , INPUT_MASK = 0x0007
};

// Empty tag types. In this header they are only used as the template argument
// of ClassForwarder<>, which selects one of the overloaded _LA()/consume()
// members declared below.
class RESOLVE_ENDIAN_AT_RUNTIME {};
class BYTE_AGNOSTIC {};
class ANTLR_LITTLE_ENDIAN {};
class ANTLR_BIG_ENDIAN {};

/** Common interface that presents an input as a stream of integers.
 *
 *  \tparam ImplTraits Traits bundle; this class uses its AllocPolicyType (as
 *                     base class) and its StringType.
 *  \tparam SuperType  Type returned (as a pointer) by get_super().
 *
 *  Only declarations appear here; the definitions are expected in
 *  antlr3intstream.inl, which is included at the end of this header.
 */
template<class ImplTraits, class SuperType>
class IntStream : public ImplTraits::AllocPolicyType
{
public:
    typedef typename ImplTraits::StringType StringType;

protected:
    /** Potentially useful in error reporting and so on, this string is
     * an identification of the input source. It may be NULL, so anything
     * attempting to access it needs to check this and substitute a sensible
     * default.
     */
    StringType m_streamName;

    /** Last marker position allocated */
    ANTLR_MARKER m_lastMarker;

    bool m_upper_case; // if set, values should be returned in upper case

    /// Indicates whether we should implement endian-specific logic
    /// 0 - Undefined 1 - Default(machine and input are both same), 2 - Little Endian, 3 - Big Endian
    ANTLR_UINT8 m_endian_spec;

public:
    IntStream();

    // Return a string that identifies the input source
    //
    StringType getSourceName();
    StringType& get_streamName();
    const StringType& get_streamName() const;
    ANTLR_MARKER get_lastMarker() const;

    // NOTE(review): presumably returns this object viewed as SuperType;
    // confirm against antlr3intstream.inl.
    SuperType* get_super();

    /**
     * Function that installs a version of LA that always
     * returns upper case. Only valid for character streams and creates a case
     * insensitive lexer if the lexer tokens are described in upper case. The
     * tokens will preserve case in the token text.
     */
    void setUcaseLA(bool flag);

    /** Consume the next 'ANTLR3_UINT32' in the stream */
    void consume();

    /** Get ANTLR3_UINT32 at current input pointer + i ahead where i=1 is next ANTLR3_UINT32 */
    ANTLR_UINT32 _LA( ANTLR_INT32 i);

    /** Tell the stream to start buffering if it hasn't already. Return
     * current input position, index(), or some other marker so that
     * when passed to rewind() you get back to the same spot.
     * rewind(mark()) should not affect the input cursor.
     */
    ANTLR_MARKER mark();

    /** Return the current input symbol index 0..n where n indicates the
     * last symbol has been read.
     */
    ANTLR_MARKER index();

    /** Reset the stream so that next call to index would return marker.
     * The marker will usually be index() but it doesn't have to be. It's
     * just a marker to indicate what state the stream was in. This is
     * essentially calling release() and seek(). If there are markers
     * created after this marker argument, this routine must unroll them
     * like a stack. Assume the state the stream was in when this marker
     * was created.
     */
    void rewind(ANTLR_MARKER marker);

    /** Reset the stream to the last marker position, without destroying the
     * last marker position.
     */
    void rewindLast();

    /** You may want to commit to a backtrack but don't want to force the
     * stream to keep bookkeeping objects around for a marker that is
     * no longer necessary. This will have the same behavior as
     * rewind() except it releases resources without the backward seek.
     */
    void release(ANTLR_MARKER mark);

    /** Set the input cursor to the position indicated by index. This is
     * normally used to seek ahead in the input stream. No buffering is
     * required to do this unless you know your stream will use seek to
     * move backwards such as when backtracking.
     *
     * This is different from rewind in its multi-directional
     * requirement and in that its argument is strictly an input cursor (index).
     *
     * For char streams, seeking forward must update the stream state such
     * as line number. For seeking backwards, you will be presumably
     * backtracking using the mark/rewind mechanism that restores state and
     * so this method does not need to update state when seeking backwards.
     *
     * Currently, this method is only used for efficient backtracking, but
     * in the future it may be used for incremental parsing.
     */
    void seek(ANTLR_MARKER index);

    /// Debug only method to flag consumption of initial off-channel
    /// tokens in the input stream
    ///
    void consumeInitialHiddenTokens();

    void rewindMark(ANTLR_MARKER marker);
    ANTLR_MARKER tindex();

    /** Frees any resources that were allocated for the implementation of this
     * interface. Usually this is just releasing the memory allocated
     * for the structure itself, but it may of course do anything it needs to
     * so long as it does not stamp on anything else.
     */
    ~IntStream();

protected:
    void setupIntStream(bool machineBigEndian, bool inputBigEndian);
    void findout_endian_spec(bool machineBigEndian, bool inputBigEndian);

    // If the user chooses this option, then we will be resolving things at run-time
    ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );

    // Resolve into one of the three categories below at runtime
    void consume( ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
};

/** IntStream specialisation that redeclares _LA() and setupIntStream().
 *  NOTE(review): presumably for EBCDIC encoded character input, going by the
 *  class name -- confirm against antlr3intstream.inl.
 */
template<class ImplTraits, class SuperType>
class EBCDIC_IntStream : public IntStream<ImplTraits, SuperType>
{
public:
    ANTLR_UINT32 _LA( ANTLR_INT32 i);

protected:
    // Takes no arguments, unlike IntStream::setupIntStream(bool, bool), which it hides.
    void setupIntStream();
};

/** IntStream specialisation that redeclares _LA(), consume() and setupIntStream().
 *  NOTE(review): presumably for UTF-8 encoded character input, going by the
 *  class and helper names -- confirm against antlr3intstream.inl.
 */
template<class ImplTraits, class SuperType>
class UTF8_IntStream : public IntStream<ImplTraits, SuperType>
{
public:
    ANTLR_UINT32 _LA( ANTLR_INT32 i);
    void consume();

protected:
    void setupIntStream(bool machineBigEndian, bool inputBigEndian);

private:
    // Accessors for lookup tables; the table contents are not visible in this header.
    static const ANTLR_UINT32* TrailingBytesForUTF8();
    static const UTF32* OffsetsFromUTF8();
};

/** IntStream specialisation for UTF16 input. Besides the public _LA()/consume()
 *  it declares one protected overload per byte-order tag (BYTE_AGNOSTIC,
 *  ANTLR_LITTLE_ENDIAN, ANTLR_BIG_ENDIAN), and redeclares index() and seek().
 */
template<class ImplTraits, class SuperType>
class UTF16_IntStream : public IntStream<ImplTraits, SuperType>
{
public:
    ANTLR_UINT32 _LA( ANTLR_INT32 i);
    void consume();
    ANTLR_MARKER index();
    void seek(ANTLR_MARKER seekPoint);

protected:
    void setupIntStream(bool machineBigEndian, bool inputBigEndian);

    /// \brief Return the input element assuming an 8 bit ascii input
    ///
    /// NOTE(review): the "8 bit ascii" wording does not fit a UTF16 stream and
    /// looks copied from another stream type; presumably this is the overload
    /// used when input and machine byte order agree -- confirm against
    /// antlr3intstream.inl.
    ///
    /// \param[in] i 1 based offset of next input stream element
    ///
    /// \return Next input character in internal ANTLR3 encoding (UTF32)
    ///
    ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<BYTE_AGNOSTIC> );

    /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
    ///
    /// \param[in] i 1 based offset of next input stream element
    ///
    /// \return Next input character in internal ANTLR3 encoding (UTF32)
    ///
    ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<ANTLR_LITTLE_ENDIAN> );

    /// \brief Return the input element assuming a UTF16 input when the input is Big Endian and the machine is not
    ///
    /// \param[in] i 1 based offset of next input stream element
    ///
    /// \return Next input character in internal ANTLR3 encoding (UTF32)
    ///
    ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<ANTLR_BIG_ENDIAN> );

    /// \brief Consume the next character in a UTF16 input stream
    ///
    void consume( ClassForwarder<BYTE_AGNOSTIC> );

    /// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not
    /// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance
    /// sake, we assume it is validly encoded. So if a low surrogate is found at the current input position then we
    /// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream
    /// is fubar but we just ignore that.
    ///
    void consume( ClassForwarder<ANTLR_LITTLE_ENDIAN> );

    /// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not
    ///
    void consume( ClassForwarder<ANTLR_BIG_ENDIAN> );
};

/** IntStream specialisation that declares _LA()/consume() overloads for all
 *  four ClassForwarder tags, and redeclares index() and seek().
 *  NOTE(review): presumably for UTF32 encoded input, going by the class
 *  name -- confirm against antlr3intstream.inl.
 */
template<class ImplTraits, class SuperType>
class UTF32_IntStream : public IntStream<ImplTraits, SuperType>
{
public:
    ANTLR_UINT32 _LA( ANTLR_INT32 i);
    void consume();

    /// \brief Calculate the current index in the output stream.
    ///
    ANTLR_MARKER index();
    void seek(ANTLR_MARKER seekPoint);

protected:
    void setupIntStream(bool machineBigEndian, bool inputBigEndian);

    // Tagged overloads of _LA(); the ClassForwarder<> argument only selects the overload.
    ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
    ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<BYTE_AGNOSTIC> );
    ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<ANTLR_LITTLE_ENDIAN> );
    ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<ANTLR_BIG_ENDIAN> );

    // Tagged overloads of consume(), mirroring the _LA() set above.
    void consume( ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
    void consume( ClassForwarder<BYTE_AGNOSTIC> );
    void consume( ClassForwarder<ANTLR_LITTLE_ENDIAN> );
    void consume( ClassForwarder<ANTLR_BIG_ENDIAN> );
};

/** IntStream whose SuperType is ImplTraits::TokenStreamType. It redeclares the
 *  stream operations and adds a cached size field.
 */
template<class ImplTraits>
class TokenIntStream : public IntStream<ImplTraits, typename ImplTraits::TokenStreamType >
{
public:
    typedef typename ImplTraits::CommonTokenType CommonTokenType;
    typedef typename ImplTraits::StringType StringType;
    typedef typename ImplTraits::TokenStreamType TokenStreamType;
    typedef IntStream<ImplTraits, TokenStreamType > BaseType;

private:
    /** Because the indirect call, though small in individual cases can
     * mount up if there are thousands of tokens (very large input streams), callers
     * of size can optionally use this cached size field.
     */
    ANTLR_UINT32 m_cachedSize;

public:
    TokenIntStream();
    ANTLR_UINT32 get_cachedSize() const;
    void set_cachedSize( ANTLR_UINT32 cachedSize );

    void consume();
    void consumeInitialHiddenTokens();
    ANTLR_UINT32 _LA( ANTLR_INT32 i );
    ANTLR_MARKER mark();
    ANTLR_UINT32 size();
    // Takes no marker argument, unlike IntStream::release(ANTLR_MARKER), which it hides.
    void release();
    ANTLR_MARKER tindex();
    void rewindLast();
    void rewind(ANTLR_MARKER marker);
    void seek(ANTLR_MARKER index);
    StringType getSourceName();
};

/** IntStream whose SuperType is ImplTraits::CommonTreeNodeStreamType. It
 *  redeclares the stream operations and adds size().
 */
template<class ImplTraits>
class TreeNodeIntStream : public IntStream<ImplTraits, typename ImplTraits::CommonTreeNodeStreamType>
{
public:
    typedef typename ImplTraits::CommonTreeNodeStreamType CommonTreeNodeStreamType;
    typedef IntStream<ImplTraits, CommonTreeNodeStreamType > BaseType;
    typedef typename ImplTraits::TreeType TreeType;
    typedef typename ImplTraits::CommonTokenType CommonTokenType;

public:
    void consume();
    ANTLR_MARKER tindex();
    ANTLR_UINT32 _LA(ANTLR_INT32 i);
    ANTLR_MARKER mark();
    void release(ANTLR_MARKER marker);
    void rewindMark(ANTLR_MARKER marker);
    void rewindLast();
    void seek(ANTLR_MARKER index);
    ANTLR_UINT32 size();
};

ANTLR_END_NAMESPACE()

// Definitions of the members declared above.
#include "antlr3intstream.inl"

#endif