# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#*****************************************************************************
#
#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  regexcst.txt
#  ICU Regular Expression Parser State Table
#
#     This state table is used when reading and parsing a regular expression pattern.
#     The pattern parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the regex pattern grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#
#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action
#   input-char           n next-state           ^push-state     action
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#            Because the first match wins, the order of the rows within a state is significant.
#


#
#  start state, scan position is at the beginning of the pattern.
#               The single default row has no 'n', so no input is consumed here.
#
start:
    default                 term                                  doPatStart


#
#  term.  At a position where we can accept the start of most items in a pattern.
#
term:
    quoted               n  expr-quant                            doLiteralChar
    rule_char            n  expr-quant                            doLiteralChar
    '['                  n  set-open              ^set-finish     doSetBegin           # set-finish is resumed by the "pop" on the set's closing ']'
    '('                  n  open-paren
    '.'                  n  expr-quant                            doDotAny
    '^'                  n  expr-quant                            doCaret
    '$'                  n  expr-quant                            doDollar
    '\'                  n  backslash
    '|'                  n  term                                  doOrOperator
    ')'                  n  pop                                   doCloseParen         # resumes the state pushed by the matching open paren row
    eof                     term                                  doPatFinish          # eof is not consumed; NOTE(review): presumably doPatFinish stops the scan - confirm
    default                 errorDeath                            doRuleError


#
#   expr-quant    We've just finished scanning a term, now look for the optional
#                 trailing quantifier - *, +, ?, *?,  etc.
#                 Anything that is not a quantifier (or an open paren, see open-paren-quant)
#                 is passed on, unconsumed, to expr-cont.
#
expr-quant:
    '*'                  n  quant-star
    '+'                  n  quant-plus
    '?'                  n  quant-opt
    '{'                  n  interval-open                         doIntervalInit
    '('                  n  open-paren-quant
    default                 expr-cont


#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  No Quantifiers
#
expr-cont:
    '|'                  n  term                                  doOrOperator
    ')'                  n  pop                                   doCloseParen
    default                 term


#
#   open-paren-quant   Special case handling for comments appearing before a quantifier,
#                        e.g.   x(?#comment )*
#                      Open parens from expr-quant come here; anything but a (?# comment
#                      branches into the normal parenthesis sequence as quickly as possible.
#
#                      The (?# comment row pushes expr-quant, so that once the comment has been
#                      skipped the scan is again looking for a quantifier.
#
open-paren-quant:
    '?'                  n  open-paren-quant2                     doSuppressComments
    default                 open-paren

open-paren-quant2:
    '#'                  n  paren-comment         ^expr-quant
    default                 open-paren-extended


#
#   open-paren    We've got an open paren.  We need to scan further to
#                 determine what kind of paren it is - plain (, (?:, (?>, or whatever.
#
#   open-paren-extended    Have scanned "(?".  The next character selects the construct.
#                          The flag letter rows have no 'n': the letter is left in place
#                          and is tested again, and consumed, by paren-flag.
#                          The look-ahead rows push expr-cont rather than expr-quant.
#
#   open-paren-lookbehind  Have scanned "(?<".  Either a look-behind or the start of a
#                          named capture group.
#
open-paren:
    '?'                  n  open-paren-extended                   doSuppressComments
    default                 term                  ^expr-quant     doOpenCaptureParen

open-paren-extended:
    ':'                  n  term                  ^expr-quant     doOpenNonCaptureParen   #  (?:
    '>'                  n  term                  ^expr-quant     doOpenAtomicParen       #  (?>
    '='                  n  term                  ^expr-cont      doOpenLookAhead         #  (?=
    '!'                  n  term                  ^expr-cont      doOpenLookAheadNeg      #  (?!
    '<'                  n  open-paren-lookbehind
    '#'                  n  paren-comment         ^term
    'i'                     paren-flag                            doBeginMatchMode
    'd'                     paren-flag                            doBeginMatchMode
    'm'                     paren-flag                            doBeginMatchMode
    's'                     paren-flag                            doBeginMatchMode
    'u'                     paren-flag                            doBeginMatchMode
    'w'                     paren-flag                            doBeginMatchMode
    'x'                     paren-flag                            doBeginMatchMode
    '-'                     paren-flag                            doBeginMatchMode
    '('                  n  errorDeath                            doConditionalExpr       #  (?(   always an error
    '{'                  n  errorDeath                            doPerlInline            #  (?{   always an error
    default                 errorDeath                            doBadOpenParenType

open-paren-lookbehind:
    '='                  n  term                  ^expr-cont      doOpenLookBehind        #  (?<=
    '!'                  n  term                  ^expr-cont      doOpenLookBehindNeg     #  (?<!
    ascii_letter            named-capture                         doBeginNamedCapture     #  (?<name
    default                 errorDeath                            doBadOpenParenType


#
#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
#                    The state to resume at the ')' was pushed by the row that led here.
#
paren-comment:
    ')'                  n  pop
    eof                     errorDeath                            doMismatchedParenErr
    default              n  paren-comment


#
#  paren-flag    Scanned a (?ismx-ismx  flag setting
#                ')' ends a bare flag setting; ':' opens a group, pushing expr-quant.
#
paren-flag:
    'i'                  n  paren-flag                            doMatchMode
    'd'                  n  paren-flag                            doMatchMode
    'm'                  n  paren-flag                            doMatchMode
    's'                  n  paren-flag                            doMatchMode
    'u'                  n  paren-flag                            doMatchMode
    'w'                  n  paren-flag                            doMatchMode
    'x'                  n  paren-flag                            doMatchMode
    '-'                  n  paren-flag                            doMatchMode
    ')'                  n  term                                  doSetMatchMode
    ':'                  n  term                  ^expr-quant     doMatchModeParen
    default                 errorDeath                            doBadModeFlag


#
#  named-capture    (?<name> ... ), position currently on the name.
#                   The first character was already checked to be an ascii_letter by
#                   open-paren-lookbehind; subsequent characters may also be digits.
#
named-capture:
    ascii_letter         n  named-capture                         doContinueNamedCapture
    digit_char           n  named-capture                         doContinueNamedCapture
    '>'                  n  term                  ^expr-quant     doOpenCaptureParen      # common w non-named capture.
    default                 errorDeath                            doBadNamedCapture


#
#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
#                 between plain '*', '*?', '*+'
#
quant-star:
    '?'                  n  expr-cont                             doNGStar                #  *?
    '+'                  n  expr-cont                             doPossessiveStar        #  *+
    default                 expr-cont                             doStar                  #  *


#
#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
#                 between plain '+', '+?', '++'
#
quant-plus:
    '?'                  n  expr-cont                             doNGPlus                #  +?
    '+'                  n  expr-cont                             doPossessivePlus        #  ++
    default                 expr-cont                             doPlus                  #  +


#
#  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
#                 between plain '?', '??', '?+'
#
quant-opt:
    '?'                  n  expr-cont                             doNGOpt                 #  ??
    '+'                  n  expr-cont                             doPossessiveOpt         #  ?+
    default                 expr-cont                             doOpt                   #  ?


#
#   Interval         scanning a '{', the opening delimiter for an interval specification
#                                   {number} or {min, max} or {min,}
#
#                    interval-open only checks that a digit follows the '{'; it has no 'n',
#                    so that digit is consumed by interval-lower.
#                    interval-upper accepts '}' with no digits, which is the {min,} form.
#
interval-open:
    digit_char              interval-lower
    default                 errorDeath                            doIntervalError

interval-lower:
    digit_char           n  interval-lower                        doIntevalLowerDigit
    ','                  n  interval-upper
    '}'                  n  interval-type                         doIntervalSame          # {n}
    default                 errorDeath                            doIntervalError

interval-upper:
    digit_char           n  interval-upper                        doIntervalUpperDigit
    '}'                  n  interval-type
    default                 errorDeath                            doIntervalError

interval-type:
    '?'                  n  expr-cont                             doNGInterval            # {n,m}?
    '+'                  n  expr-cont                             doPossessiveInterval    # {n,m}+
    default                 expr-cont                             doInterval              # {n,m}


#
#  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
#                                  The low level next-char function will have preprocessed
#                                  some of them already; those won't come here.
#
#                      Rows for \A \B \b \G \Q \Z \z return to term; the others go on to
#                      expr-quant (or, for \k, to named-backref).
#                      The 'N', 'p' and 'P' rows have no 'n', so the letter is not consumed
#                      by the state machine.  NOTE(review): presumably the action scans the
#                      rest of the construct itself - confirm.
#
backslash:
    'A'                  n  term                                  doBackslashA
    'B'                  n  term                                  doBackslashB
    'b'                  n  term                                  doBackslashb
    'd'                  n  expr-quant                            doBackslashd
    'D'                  n  expr-quant                            doBackslashD
    'G'                  n  term                                  doBackslashG
    'h'                  n  expr-quant                            doBackslashh
    'H'                  n  expr-quant                            doBackslashH
    'k'                  n  named-backref
    'N'                     expr-quant                            doNamedChar             #   \N{NAME}  named char
    'p'                     expr-quant                            doProperty              #   \p{Lu}  style property
    'P'                     expr-quant                            doProperty
    'R'                  n  expr-quant                            doBackslashR
    'Q'                  n  term                                  doEnterQuoteMode
    'S'                  n  expr-quant                            doBackslashS
    's'                  n  expr-quant                            doBackslashs
    'v'                  n  expr-quant                            doBackslashv
    'V'                  n  expr-quant                            doBackslashV
    'W'                  n  expr-quant                            doBackslashW
    'w'                  n  expr-quant                            doBackslashw
    'X'                  n  expr-quant                            doBackslashX
    'Z'                  n  term                                  doBackslashZ
    'z'                  n  term                                  doBackslashz
    digit_char           n  expr-quant                            doBackRef               #  Will scan multiple digits
    eof                     errorDeath                            doEscapeError
    default              n  expr-quant                            doEscapedLiteralChar


# named-backref   Scanned \k
#                 Leading to \k<captureName>
#                 Failure to get the full sequence is an error.
#
#                 named-backref    expects the '<' that opens the name.
#                 named-backref-2  requires the first name character to be an ascii_letter.
#                 named-backref-3  accepts further letters and digits; '>' completes the reference.
#                 In all three states any other input goes to errorDeath with doBadNamedCapture.
#
named-backref:
    '<'                  n  named-backref-2                       doBeginNamedBackRef
    default                 errorDeath                            doBadNamedCapture

named-backref-2:
    ascii_letter         n  named-backref-3                       doContinueNamedBackRef
    default                 errorDeath                            doBadNamedCapture

named-backref-3:
    ascii_letter         n  named-backref-3                       doContinueNamedBackRef
    digit_char           n  named-backref-3                       doContinueNamedBackRef
    '>'                  n  expr-quant                            doCompleteNamedBackRef
    default                 errorDeath                            doBadNamedCapture


#
#   [set expression] parsing,
#      All states involved in parsing set expressions have names beginning with "set-"
#
#   set-open    Just after a '['.  A leading '^' negates the set.  A ':' (not consumed
#               here) may start a [:property:] expression, see set-posix.
#
#   set-open2   After the optional '^'.  A ']' in this position is taken as a literal
#               member of the set (doSetLiteral), not as the end of the set.
#
set-open:
    '^'                  n  set-open2                             doSetNegate
    ':'                     set-posix                             doSetPosixProp
    default                 set-open2

set-open2:
    ']'                  n  set-after-lit                         doSetLiteral
    default                 set-start


#  set-posix:
#                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
#                  moved the scan to the closing ']'.  If it wasn't a property
#                  expression, the scan will still be at the opening ':', which should
#                  be interpreted as a normal set expression.
set-posix:
    ']'                  n  pop                                   doSetEnd
    ':'                     set-start
    default                 errorDeath                            doRuleError             # should not be possible.


#
#   set-start   after the [ and special case leading characters (^ and/or ]) but before
#               everything else.   A '-' is literal at this point.
#               A nested '[' pushes set-after-set, which is resumed when the nested set's
#               closing ']' pops.
#
set-start:
    ']'                  n  pop                                   doSetEnd
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    '\'                  n  set-escape
    '-'                  n  set-start-dash
    '&'                  n  set-start-amp
    default              n  set-after-lit                         doSetLiteral


#  set-start-dash    Turn "[--" into a syntax error.
#                    "[-x" is good, - and x are literals.
#
set-start-dash:
    '-'                     errorDeath                            doRuleError
    default                 set-after-lit                         doSetAddDash


#  set-start-amp     Turn "[&&" into a syntax error.
#                    "[&x" is good, & and x are literals.
#
set-start-amp:
    '&'                     errorDeath                            doRuleError
    default                 set-after-lit                         doSetAddAmp


#
#  set-after-lit    The last thing scanned was a literal character within a set.
#                   Can be followed by anything.  Single '-' or '&' are
#                   literals in this context, not operators.
set-after-lit:
    ']'                  n  pop                                   doSetEnd
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    '-'                  n  set-lit-dash                                                  # range, "--" operator or literal '-'; decided in set-lit-dash
    '&'                  n  set-lit-amp                                                   # "&&" operator or literal '&'; decided in set-lit-amp
    '\'                  n  set-escape
    eof                     errorDeath                            doSetNoCloseError
    default              n  set-after-lit                         doSetLiteral


#
#  set-after-set    Same rows as set-after-lit, except that '-' and '&' lead to
#                   set-set-dash and set-set-amp.  This is the state pushed when a nested
#                   '[' is opened, and the next state after a \p or \P in set-escape.
#
set-after-set:
    ']'                  n  pop                                   doSetEnd
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    '-'                  n  set-set-dash
    '&'                  n  set-set-amp
    '\'                  n  set-escape
    eof                     errorDeath                            doSetNoCloseError
    default              n  set-after-lit                         doSetLiteral


#
#  set-after-range  Same rows as set-after-lit, except that '-' and '&' lead to
#                   set-range-dash and set-range-amp.
#
set-after-range:
    ']'                  n  pop                                   doSetEnd
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    '-'                  n  set-range-dash
    '&'                  n  set-range-amp
    '\'                  n  set-escape
    eof                     errorDeath                            doSetNoCloseError
    default              n  set-after-lit                         doSetLiteral


# set-after-op
#     After a --  or &&
#     It is an error to close a set at this point.
#
set-after-op:
    '['                  n  set-open              ^set-after-set  doSetBeginUnion
    ']'                     errorDeath                            doSetOpError
    '\'                  n  set-escape
    default              n  set-after-lit                         doSetLiteral


#
#   set-set-amp
#     Have scanned [[set]&
#     Could be a '&' intersection operator, if a set follows.
#     Could be the start of a '&&' operator.
#     Otherwise is a literal.
set-set-amp:
    '['                  n  set-open              ^set-after-set  doSetBeginIntersection1
    '&'                  n  set-after-op                          doSetIntersection2
    default                 set-after-lit                         doSetAddAmp


# set-lit-amp   Have scanned "[literals&"
#               Could be a start of "&&" operator or a literal
#               In [abc&[def]],   the '&' is a literal
#
set-lit-amp:
    '&'                  n  set-after-op                          doSetIntersection2
    default                 set-after-lit                         doSetAddAmp


#
#  set-set-dash
#      Have scanned [set]-
#      Could be a '-' difference operator, if a [set] follows.
#      Could be the start of a '--' operator.
#      Otherwise is a literal.
set-set-dash:
    '['                  n  set-open              ^set-after-set  doSetBeginDifference1
    '-'                  n  set-after-op                          doSetDifference2
    default                 set-after-lit                         doSetAddDash


#
#  set-range-dash
#      scanned  a-b-  or \w-
#         any set or range like item where the trailing single '-' should
#         be literal, not a set difference operation.
#         A trailing "--" is still a difference operator.
set-range-dash:
    '-'                  n  set-after-op                          doSetDifference2
    default                 set-after-lit                         doSetAddDash


#
#  set-range-amp
#      Counterpart of set-range-dash for '&': a second '&' goes to set-after-op,
#      anything else leaves the single '&' to be added as a literal (doSetAddAmp).
#
set-range-amp:
    '&'                  n  set-after-op                          doSetIntersection2
    default                 set-after-lit                         doSetAddAmp


#  set-lit-dash
#     Have scanned "[literals-"  Could be a range or a -- operator or a literal
#     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
#        [abc-\p{xx}  the '-' is an error
#        [abc-]       the '-' is a literal
#        [ab-xy]      the '-' is a range
#
#     The '[' and ']' rows have no 'n': the bracket is left in place and is handled
#     by set-after-lit once the '-' has been added as a literal.
#
set-lit-dash:
    '-'                  n  set-after-op                          doSetDifference2
    '['                     set-after-lit                         doSetAddDash
    ']'                     set-after-lit                         doSetAddDash
    '\'                  n  set-lit-dash-escape
    default              n  set-after-range                       doSetRange


# set-lit-dash-escape
#
#    scanned "[literal-\"
#    Could be a range, if the \ introduces an escaped literal char or a named char.
#    Otherwise it is an error.
#
#    NOTE(review): the set-lit-dash comment lists "[abc-\p{xx}" as an error, but there is
#    no 'p' / 'P' row here (nor 'h' 'H' 'v' 'V', which set-escape treats as sets), so
#    those letters would match the default row and be handled by doSetRange - confirm
#    whether this is intended or caught elsewhere.
#
set-lit-dash-escape:
    's'                     errorDeath                            doSetOpError
    'S'                     errorDeath                            doSetOpError
    'w'                     errorDeath                            doSetOpError
    'W'                     errorDeath                            doSetOpError
    'd'                     errorDeath                            doSetOpError
    'D'                     errorDeath                            doSetOpError
    'N'                     set-after-range                       doSetNamedRange
    default              n  set-after-range                       doSetRange


#
#  set-escape
#       Common back-slash escape processing within set expressions
#
#       The 'p', 'P' and 'N' rows have no 'n', so the letter is not consumed by the
#       state machine.  NOTE(review): presumably the action scans the rest of the
#       \p{...} / \N{...} construct itself - confirm.
#
set-escape:
    'p'                     set-after-set                         doSetProp
    'P'                     set-after-set                         doSetProp
    'N'                     set-after-lit                         doSetNamedChar
    's'                  n  set-after-range                       doSetBackslash_s
    'S'                  n  set-after-range                       doSetBackslash_S
    'w'                  n  set-after-range                       doSetBackslash_w
    'W'                  n  set-after-range                       doSetBackslash_W
    'd'                  n  set-after-range                       doSetBackslash_d
    'D'                  n  set-after-range                       doSetBackslash_D
    'h'                  n  set-after-range                       doSetBackslash_h
    'H'                  n  set-after-range                       doSetBackslash_H
    'v'                  n  set-after-range                       doSetBackslash_v
    'V'                  n  set-after-range                       doSetBackslash_V
    default              n  set-after-lit                         doSetLiteralEscaped


#
# set-finish
#     Have just encountered the final ']' that completes a [set], and
#     arrived here via a pop.  From here, we exit the set parsing world, and go
#     back to generic regular expression parsing.
#
set-finish:
    default                 expr-quant                            doSetFinish


#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
errorDeath:
    default              n  errorDeath                            doExit