// Copyright 2018 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef V8_TORQUE_EARLEY_PARSER_H_ #define V8_TORQUE_EARLEY_PARSER_H_ #include <map> #include <vector> #include "src/base/optional.h" #include "src/torque/contextual.h" #include "src/torque/source-positions.h" #include "src/torque/utils.h" namespace v8 { namespace internal { namespace torque { class Symbol; class Item; class ParseResultHolderBase { public: enum class TypeId; virtual ~ParseResultHolderBase() = default; template <class T> T& Cast(); template <class T> const T& Cast() const; protected: explicit ParseResultHolderBase(TypeId type_id) : type_id_(type_id) { // MSVC wrongly complains about type_id_ being an unused private field. USE(type_id_); } private: const TypeId type_id_; }; using ParseResultTypeId = ParseResultHolderBase::TypeId; template <class T> class ParseResultHolder : public ParseResultHolderBase { public: explicit ParseResultHolder(T value) : ParseResultHolderBase(id), value_(std::move(value)) {} private: V8_EXPORT_PRIVATE static const TypeId id; friend class ParseResultHolderBase; T value_; }; template <class T> T& ParseResultHolderBase::Cast() { CHECK_EQ(ParseResultHolder<T>::id, type_id_); return static_cast<ParseResultHolder<T>*>(this)->value_; } template <class T> const T& ParseResultHolderBase::Cast() const { CHECK_EQ(ParseResultHolder<T>::id, type_id_); return static_cast<const ParseResultHolder<T>*>(this)->value_; } class ParseResult { public: template <class T> explicit ParseResult(T x) : value_(new ParseResultHolder<T>(std::move(x))) {} template <class T> const T& Cast() const { return value_->Cast<T>(); } template <class T> T& Cast() { return value_->Cast<T>(); } private: std::unique_ptr<ParseResultHolderBase> value_; }; using InputPosition = const char*; struct MatchedInput { MatchedInput(InputPosition begin, InputPosition end, SourcePosition pos) : begin(begin), end(end), pos(pos) {} InputPosition begin; InputPosition end; SourcePosition pos; std::string ToString() const { return {begin, end}; } }; class ParseResultIterator { public: explicit ParseResultIterator(std::vector<ParseResult> results, MatchedInput matched_input) : results_(std::move(results)), matched_input_(matched_input) {} ~ParseResultIterator() { // Check that all parse results have been used. CHECK_EQ(results_.size(), i_); } ParseResult Next() { CHECK_LT(i_, results_.size()); return std::move(results_[i_++]); } template <class T> T NextAs() { return std::move(Next().Cast<T>()); } bool HasNext() const { return i_ < results_.size(); } const MatchedInput& matched_input() const { return matched_input_; } private: std::vector<ParseResult> results_; size_t i_ = 0; MatchedInput matched_input_; DISALLOW_COPY_AND_MOVE_AND_ASSIGN(ParseResultIterator); }; struct LexerResult { std::vector<Symbol*> token_symbols; std::vector<MatchedInput> token_contents; }; using Action = base::Optional<ParseResult> (*)(ParseResultIterator* child_results); inline base::Optional<ParseResult> DefaultAction( ParseResultIterator* child_results) { if (!child_results->HasNext()) return base::nullopt; return child_results->Next(); } // A rule of the context-free grammar. Each rule can have an action attached to // it, which is executed after the parsing is finished. class Rule final { public: explicit Rule(std::vector<Symbol*> right_hand_side, Action action = DefaultAction) : right_hand_side_(std::move(right_hand_side)), action_(action) {} Symbol* left() const { DCHECK_NOT_NULL(left_hand_side_); return left_hand_side_; } const std::vector<Symbol*>& right() const { return right_hand_side_; } void SetLeftHandSide(Symbol* left_hand_side) { DCHECK_NULL(left_hand_side_); left_hand_side_ = left_hand_side; } V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction( const Item* completed_item, const LexerResult& tokens) const; private: Symbol* left_hand_side_ = nullptr; std::vector<Symbol*> right_hand_side_; Action action_; }; // A Symbol represents a terminal or a non-terminal of the grammar. // It stores the list of rules, which have this symbol as the // left-hand side. // Terminals have an empty list of rules, they are created by the Lexer // instead of from rules. // Symbols need to reside at stable memory addresses, because the addresses are // used in the parser. class Symbol { public: Symbol() : Symbol({}) {} Symbol(std::initializer_list<Rule> rules) { *this = rules; } V8_EXPORT_PRIVATE Symbol& operator=(std::initializer_list<Rule> rules); bool IsTerminal() const { return rules_.empty(); } Rule* rule(size_t index) const { return rules_[index].get(); } size_t rule_number() const { return rules_.size(); } void AddRule(const Rule& rule) { rules_.push_back(base::make_unique<Rule>(rule)); rules_.back()->SetLeftHandSide(this); } V8_EXPORT_PRIVATE base::Optional<ParseResult> RunAction( const Item* item, const LexerResult& tokens); private: std::vector<std::unique_ptr<Rule>> rules_; // Disallow copying and moving to ensure Symbol has a stable address. DISALLOW_COPY_AND_MOVE_AND_ASSIGN(Symbol); }; // Items are the core datastructure of Earley's algorithm. // They consist of a (partially) matched rule, a marked position inside of the // right-hand side of the rule (traditionally written as a dot) and an input // range from {start} to {pos} that matches the symbols of the right-hand side // that are left of the mark. In addition, they store a child and a left-sibling // pointer to reconstruct the AST in the end. class Item { public: Item(const Rule* rule, size_t mark, size_t start, size_t pos) : rule_(rule), mark_(mark), start_(start), pos_(pos) { DCHECK_LE(mark_, right().size()); } // A complete item has the mark at the right end, which means the input range // matches the complete rule. bool IsComplete() const { DCHECK_LE(mark_, right().size()); return mark_ == right().size(); } // The symbol right after the mark is expected at {pos} for this item to // advance. Symbol* NextSymbol() const { DCHECK(!IsComplete()); DCHECK_LT(mark_, right().size()); return right()[mark_]; } // We successfully parsed NextSymbol() between {pos} and {new_pos}. // If NextSymbol() was a non-terminal, then {child} is a pointer to a // completed item for this parse. // We create a new item, which moves the mark one forward. Item Advance(size_t new_pos, const Item* child = nullptr) const { if (child) { DCHECK(child->IsComplete()); DCHECK_EQ(pos(), child->start()); DCHECK_EQ(new_pos, child->pos()); DCHECK_EQ(NextSymbol(), child->left()); } Item result(rule_, mark_ + 1, start_, new_pos); result.prev_ = this; result.child_ = child; return result; } // Collect the items representing the AST children of this completed item. std::vector<const Item*> Children() const; // The matched input separated according to the next branching AST level. std::string SplitByChildren(const LexerResult& tokens) const; // Check if {other} results in the same AST as this Item. void CheckAmbiguity(const Item& other, const LexerResult& tokens) const; MatchedInput GetMatchedInput(const LexerResult& tokens) const { return {tokens.token_contents[start_].begin, start_ == pos_ ? tokens.token_contents[start_].begin : tokens.token_contents[pos_ - 1].end, tokens.token_contents[start_].pos}; } // We exclude {prev_} and {child_} from equality and hash computations, // because they are just globally unique data associated with an item. bool operator==(const Item& other) const { return rule_ == other.rule_ && mark_ == other.mark_ && start_ == other.start_ && pos_ == other.pos_; } friend size_t hash_value(const Item& i) { return base::hash_combine(i.rule_, i.mark_, i.start_, i.pos_); } const Rule* rule() const { return rule_; } Symbol* left() const { return rule_->left(); } const std::vector<Symbol*>& right() const { return rule_->right(); } size_t pos() const { return pos_; } size_t start() const { return start_; } private: const Rule* rule_; size_t mark_; size_t start_; size_t pos_; const Item* prev_ = nullptr; const Item* child_ = nullptr; }; inline base::Optional<ParseResult> Symbol::RunAction( const Item* item, const LexerResult& tokens) { DCHECK(item->IsComplete()); DCHECK_EQ(item->left(), this); return item->rule()->RunAction(item, tokens); } V8_EXPORT_PRIVATE const Item* RunEarleyAlgorithm( Symbol* start, const LexerResult& tokens, std::unordered_set<Item, base::hash<Item>>* processed); inline base::Optional<ParseResult> ParseTokens(Symbol* start, const LexerResult& tokens) { std::unordered_set<Item, base::hash<Item>> table; const Item* final_item = RunEarleyAlgorithm(start, tokens, &table); return start->RunAction(final_item, tokens); } // The lexical syntax is dynamically defined while building the grammar by // adding patterns and keywords to the Lexer. // The term keyword here can stand for any fixed character sequence, including // operators and parentheses. // Each pattern or keyword automatically gets a terminal symbol associated with // it. These symbols form the result of the lexing. // Patterns and keywords are matched using the longest match principle. If the // longest matching pattern coincides with a keyword, the keyword symbol is // chosen instead of the pattern. // In addition, there is a single whitespace pattern which is consumed but does // not become part of the token list. class Lexer { public: // Functions to define patterns. They try to match starting from {pos}. If // successful, they return true and advance {pos}. Otherwise, {pos} stays // unchanged. using PatternFunction = bool (*)(InputPosition* pos); void SetWhitespace(PatternFunction whitespace) { match_whitespace_ = whitespace; } Symbol* Pattern(PatternFunction pattern) { return &patterns_[pattern]; } Symbol* Token(const std::string& keyword) { return &keywords_[keyword]; } V8_EXPORT_PRIVATE LexerResult RunLexer(const std::string& input); private: PatternFunction match_whitespace_ = [](InputPosition*) { return false; }; std::map<PatternFunction, Symbol> patterns_; std::map<std::string, Symbol> keywords_; Symbol* MatchToken(InputPosition* pos, InputPosition end); }; // A grammar can have a result, which is the results of the start symbol. // Grammar is intended to be subclassed, with Symbol members forming the // mutually recursive rules of the grammar. class Grammar { public: using PatternFunction = Lexer::PatternFunction; explicit Grammar(Symbol* start) : start_(start) {} base::Optional<ParseResult> Parse(const std::string& input) { LexerResult tokens = lexer().RunLexer(input); return ParseTokens(start_, tokens); } protected: Symbol* Token(const std::string& s) { return lexer_.Token(s); } Symbol* Pattern(PatternFunction pattern) { return lexer_.Pattern(pattern); } void SetWhitespace(PatternFunction ws) { lexer_.SetWhitespace(ws); } // NewSymbol() allocates a fresh symbol and stores it in the current grammar. // This is necessary to define helpers that create new symbols. Symbol* NewSymbol(std::initializer_list<Rule> rules = {}) { Symbol* result = new Symbol(rules); generated_symbols_.push_back(std::unique_ptr<Symbol>(result)); return result; } // Helper functions to define lexer patterns. If they match, they return true // and advance {pos}. Otherwise, {pos} is unchanged. V8_EXPORT_PRIVATE static bool MatchChar(int (*char_class)(int), InputPosition* pos); V8_EXPORT_PRIVATE static bool MatchChar(bool (*char_class)(char), InputPosition* pos); V8_EXPORT_PRIVATE static bool MatchAnyChar(InputPosition* pos); V8_EXPORT_PRIVATE static bool MatchString(const char* s, InputPosition* pos); // The action MatchInput() produces the input matched by the rule as // result. static base::Optional<ParseResult> YieldMatchedInput( ParseResultIterator* child_results) { return ParseResult{child_results->matched_input().ToString()}; } // Create a new symbol to parse the given sequence of symbols. // At most one of the symbols can return a result. Symbol* Sequence(std::vector<Symbol*> symbols) { return NewSymbol({Rule(std::move(symbols))}); } template <class T, T value> static base::Optional<ParseResult> YieldIntegralConstant( ParseResultIterator* child_results) { return ParseResult{value}; } template <class T> static base::Optional<ParseResult> YieldDefaultValue( ParseResultIterator* child_results) { return ParseResult{T{}}; } template <class From, class To> static base::Optional<ParseResult> CastParseResult( ParseResultIterator* child_results) { To result = std::move(child_results->NextAs<From>()); return ParseResult{std::move(result)}; } // Try to parse {s} and return the result of type {Result} casted to {T}. // Otherwise, the result is a default-constructed {T}. template <class T, class Result = T> Symbol* TryOrDefault(Symbol* s) { return NewSymbol({Rule({s}, CastParseResult<Result, T>), Rule({}, YieldDefaultValue<T>)}); } template <class T> static base::Optional<ParseResult> MakeSingletonVector( ParseResultIterator* child_results) { T x = child_results->NextAs<T>(); std::vector<T> result; result.push_back(std::move(x)); return ParseResult{std::move(result)}; } template <class T> static base::Optional<ParseResult> MakeExtendedVector( ParseResultIterator* child_results) { std::vector<T> l = child_results->NextAs<std::vector<T>>(); T x = child_results->NextAs<T>(); l.push_back(std::move(x)); return ParseResult{std::move(l)}; } // For example, NonemptyList(Token("A"), Token(",")) parses any of // A or A,A or A,A,A and so on. template <class T> Symbol* NonemptyList(Symbol* element, base::Optional<Symbol*> separator = {}) { Symbol* list = NewSymbol(); *list = {Rule({element}, MakeSingletonVector<T>), separator ? Rule({list, *separator, element}, MakeExtendedVector<T>) : Rule({list, element}, MakeExtendedVector<T>)}; return list; } template <class T> Symbol* List(Symbol* element, base::Optional<Symbol*> separator = {}) { return TryOrDefault<std::vector<T>>(NonemptyList<T>(element, separator)); } template <class T> Symbol* Optional(Symbol* x) { return TryOrDefault<base::Optional<T>, T>(x); } Symbol* CheckIf(Symbol* x) { return NewSymbol({Rule({x}, YieldIntegralConstant<bool, true>), Rule({}, YieldIntegralConstant<bool, false>)}); } Lexer& lexer() { return lexer_; } private: Lexer lexer_; std::vector<std::unique_ptr<Symbol>> generated_symbols_; Symbol* start_; }; } // namespace torque } // namespace internal } // namespace v8 #endif // V8_TORQUE_EARLEY_PARSER_H_