C++程序  |  166行  |  5.73 KB

/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "DFA.h"
#include "DFAState.h"
#include "NFA.h"
#include "NFAState.h"

#include <algorithm>
#include <climits>
#include <memory>
#include <unordered_map>
#include <set>
#include <vector>

/**
 * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and
 * DFAs differ only in that an NFA allows multiple states at the same time, we can find each
 * possible combination of simultaneous NFA states and give this combination a label. These labelled
 * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time.
 *
 * As an NFA can end up in multiple accept states at the same time (for instance, the token "while"
 * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex
 * (in terms of the order in which they were added to the NFA).
 */
class NFAtoDFA {
public:
    static constexpr char START_CHAR = 9;
    static constexpr char END_CHAR = 126;

    NFAtoDFA(NFA* nfa)
    : fNFA(*nfa) {}

    /**
     * Returns a DFA created from the NFA.
     */
    DFA convert() {
        // create state 0, the "reject" state
        getState(DFAState::Label({}));
        // create a state representing being in all of the NFA's start states at once
        std::vector<int> startStates = fNFA.fStartStates;
        std::sort(startStates.begin(), startStates.end());
        // this becomes state 1, our start state
        DFAState* start = getState(DFAState::Label(startStates));
        this->scanState(start);

        this->computeMappings();

        int stateCount = 0;
        for (const auto& row : fTransitions) {
            stateCount = std::max(stateCount, (int) row.size());
        }
        return DFA(fCharMappings, fTransitions, fAccepts);
    }

private:
    /**
     * Returns an existing state with the given label, or creates a new one and returns it.
     */
    DFAState* getState(DFAState::Label label) {
        auto found = fStates.find(label);
        if (found == fStates.end()) {
            int id = fStates.size();
            fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label));
            return fStates[label].get();
        }
        return found->second.get();
    }

    void add(int nfaState, std::vector<int>* states) {
        NFAState state = fNFA.fStates[nfaState];
        if (state.fKind == NFAState::kRemapped_Kind) {
            for (int next : state.fData) {
                this->add(next, states);
            }
        } else {
            for (int state : *states) {
                if (nfaState == state) {
                    return;
                }
            }
            states->push_back(nfaState);
        }
    }

    void addTransition(char c, int start, int next) {
        while (fTransitions.size() <= (size_t) c) {
            fTransitions.push_back(std::vector<int>());
        }
        std::vector<int>& row = fTransitions[c];
        while (row.size() <= (size_t) start) {
            row.push_back(INVALID);
        }
        row[start] = next;
    }

    void scanState(DFAState* state) {
        state->fIsScanned = true;
        for (char c = START_CHAR; c <= END_CHAR; ++c) {
            std::vector<int> next;
            int bestAccept = INT_MAX;
            for (int idx : state->fLabel.fStates) {
                const NFAState& nfaState = fNFA.fStates[idx];
                if (nfaState.accept(c)) {
                    for (int nextState : nfaState.fNext) {
                        if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) {
                            bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]);
                        }
                        this->add(nextState, &next);
                    }
                }
            }
            std::sort(next.begin(), next.end());
            DFAState* nextState = this->getState(DFAState::Label(next));
            this->addTransition(c, state->fId, nextState->fId);
            if (bestAccept != INT_MAX) {
                while (fAccepts.size() <= (size_t) nextState->fId) {
                    fAccepts.push_back(INVALID);
                }
                fAccepts[nextState->fId] = bestAccept;
            }
            if (!nextState->fIsScanned) {
                this->scanState(nextState);
            }
        }
    }

    // collapse rows with the same transitions to a single row. This is common, as each row
    // represents a character and often there are many characters for which all transitions are
    // identical (e.g. [0-9] are treated the same way by all lexer rules)
    void computeMappings() {
        // mappings[<input row>] = <output row>
        std::vector<std::vector<int>*> uniques;
        // this could be done more efficiently, but O(n^2) is plenty fast for our purposes
        for (size_t i = 0; i < fTransitions.size(); ++i) {
            int found = -1;
            for (size_t j = 0; j < uniques.size(); ++j) {
                if (*uniques[j] == fTransitions[i]) {
                    found = j;
                    break;
                }
            }
            if (found == -1) {
                found = (int) uniques.size();
                uniques.push_back(&fTransitions[i]);
            }
            fCharMappings.push_back(found);
        }
        std::vector<std::vector<int>> newTransitions;
        for (std::vector<int>* row : uniques) {
            newTransitions.push_back(*row);
        }
        fTransitions = newTransitions;
    }

    const NFA& fNFA;
    std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates;
    std::vector<std::vector<int>> fTransitions;
    std::vector<int> fCharMappings;
    std::vector<int> fAccepts;
};