/* * Copyright 2017 Google Inc. * * Use of this source code is governed by a BSD-style license that can be * found in the LICENSE file. */ #include "DFA.h" #include "DFAState.h" #include "NFA.h" #include "NFAState.h" #include <algorithm> #include <climits> #include <memory> #include <unordered_map> #include <set> #include <vector> /** * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and * DFAs differ only in that an NFA allows multiple states at the same time, we can find each * possible combination of simultaneous NFA states and give this combination a label. These labelled * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time. * * As an NFA can end up in multiple accept states at the same time (for instance, the token "while" * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex * (in terms of the order in which they were added to the NFA). */ class NFAtoDFA { public: static constexpr char START_CHAR = 9; static constexpr char END_CHAR = 126; NFAtoDFA(NFA* nfa) : fNFA(*nfa) {} /** * Returns a DFA created from the NFA. */ DFA convert() { // create state 0, the "reject" state getState(DFAState::Label({})); // create a state representing being in all of the NFA's start states at once std::vector<int> startStates = fNFA.fStartStates; std::sort(startStates.begin(), startStates.end()); // this becomes state 1, our start state DFAState* start = getState(DFAState::Label(startStates)); this->scanState(start); this->computeMappings(); int stateCount = 0; for (const auto& row : fTransitions) { stateCount = std::max(stateCount, (int) row.size()); } return DFA(fCharMappings, fTransitions, fAccepts); } private: /** * Returns an existing state with the given label, or creates a new one and returns it. */ DFAState* getState(DFAState::Label label) { auto found = fStates.find(label); if (found == fStates.end()) { int id = fStates.size(); fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label)); return fStates[label].get(); } return found->second.get(); } void add(int nfaState, std::vector<int>* states) { NFAState state = fNFA.fStates[nfaState]; if (state.fKind == NFAState::kRemapped_Kind) { for (int next : state.fData) { this->add(next, states); } } else { for (int state : *states) { if (nfaState == state) { return; } } states->push_back(nfaState); } } void addTransition(char c, int start, int next) { while (fTransitions.size() <= (size_t) c) { fTransitions.push_back(std::vector<int>()); } std::vector<int>& row = fTransitions[c]; while (row.size() <= (size_t) start) { row.push_back(INVALID); } row[start] = next; } void scanState(DFAState* state) { state->fIsScanned = true; for (char c = START_CHAR; c <= END_CHAR; ++c) { std::vector<int> next; int bestAccept = INT_MAX; for (int idx : state->fLabel.fStates) { const NFAState& nfaState = fNFA.fStates[idx]; if (nfaState.accept(c)) { for (int nextState : nfaState.fNext) { if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) { bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]); } this->add(nextState, &next); } } } std::sort(next.begin(), next.end()); DFAState* nextState = this->getState(DFAState::Label(next)); this->addTransition(c, state->fId, nextState->fId); if (bestAccept != INT_MAX) { while (fAccepts.size() <= (size_t) nextState->fId) { fAccepts.push_back(INVALID); } fAccepts[nextState->fId] = bestAccept; } if (!nextState->fIsScanned) { this->scanState(nextState); } } } // collapse rows with the same transitions to a single row. This is common, as each row // represents a character and often there are many characters for which all transitions are // identical (e.g. [0-9] are treated the same way by all lexer rules) void computeMappings() { // mappings[<input row>] = <output row> std::vector<std::vector<int>*> uniques; // this could be done more efficiently, but O(n^2) is plenty fast for our purposes for (size_t i = 0; i < fTransitions.size(); ++i) { int found = -1; for (size_t j = 0; j < uniques.size(); ++j) { if (*uniques[j] == fTransitions[i]) { found = j; break; } } if (found == -1) { found = (int) uniques.size(); uniques.push_back(&fTransitions[i]); } fCharMappings.push_back(found); } std::vector<std::vector<int>> newTransitions; for (std::vector<int>* row : uniques) { newTransitions.push_back(*row); } fTransitions = newTransitions; } const NFA& fNFA; std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates; std::vector<std::vector<int>> fTransitions; std::vector<int> fCharMappings; std::vector<int> fAccepts; };