Home | History | Annotate | Download | only in lex
      1 /*
      2  * Copyright 2017 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include "DFA.h"
      9 #include "DFAState.h"
     10 #include "NFA.h"
     11 #include "NFAState.h"
     12 
     13 #include <algorithm>
     14 #include <climits>
     15 #include <memory>
     16 #include <unordered_map>
     17 #include <set>
     18 #include <vector>
     19 
     20 /**
     21  * Converts a nondeterministic finite automaton to a deterministic finite automaton. Since NFAs and
     22  * DFAs differ only in that an NFA allows multiple states at the same time, we can find each
     23  * possible combination of simultaneous NFA states and give this combination a label. These labelled
     24  * nodes are our DFA nodes, since we can only be in one such unique set of NFA states at a time.
     25  *
     26  * As an NFA can end up in multiple accept states at the same time (for instance, the token "while"
     27  * is valid for both WHILE and IDENTIFIER), we disambiguate by preferring the first matching regex
     28  * (in terms of the order in which they were added to the NFA).
     29  */
     30 class NFAtoDFA {
     31 public:
     32     static constexpr char START_CHAR = 9;
     33     static constexpr char END_CHAR = 126;
     34 
     35     NFAtoDFA(NFA* nfa)
     36     : fNFA(*nfa) {}
     37 
     38     /**
     39      * Returns a DFA created from the NFA.
     40      */
     41     DFA convert() {
     42         // create state 0, the "reject" state
     43         getState(DFAState::Label({}));
     44         // create a state representing being in all of the NFA's start states at once
     45         std::vector<int> startStates = fNFA.fStartStates;
     46         std::sort(startStates.begin(), startStates.end());
     47         // this becomes state 1, our start state
     48         DFAState* start = getState(DFAState::Label(startStates));
     49         this->scanState(start);
     50 
     51         this->computeMappings();
     52 
     53         int stateCount = 0;
     54         for (const auto& row : fTransitions) {
     55             stateCount = std::max(stateCount, (int) row.size());
     56         }
     57         return DFA(fCharMappings, fTransitions, fAccepts);
     58     }
     59 
     60 private:
     61     /**
     62      * Returns an existing state with the given label, or creates a new one and returns it.
     63      */
     64     DFAState* getState(DFAState::Label label) {
     65         auto found = fStates.find(label);
     66         if (found == fStates.end()) {
     67             int id = fStates.size();
     68             fStates[label] = std::unique_ptr<DFAState>(new DFAState(id, label));
     69             return fStates[label].get();
     70         }
     71         return found->second.get();
     72     }
     73 
     74     void add(int nfaState, std::vector<int>* states) {
     75         NFAState state = fNFA.fStates[nfaState];
     76         if (state.fKind == NFAState::kRemapped_Kind) {
     77             for (int next : state.fData) {
     78                 this->add(next, states);
     79             }
     80         } else {
     81             for (int state : *states) {
     82                 if (nfaState == state) {
     83                     return;
     84                 }
     85             }
     86             states->push_back(nfaState);
     87         }
     88     }
     89 
     90     void addTransition(char c, int start, int next) {
     91         while (fTransitions.size() <= (size_t) c) {
     92             fTransitions.push_back(std::vector<int>());
     93         }
     94         std::vector<int>& row = fTransitions[c];
     95         while (row.size() <= (size_t) start) {
     96             row.push_back(INVALID);
     97         }
     98         row[start] = next;
     99     }
    100 
    101     void scanState(DFAState* state) {
    102         state->fIsScanned = true;
    103         for (char c = START_CHAR; c <= END_CHAR; ++c) {
    104             std::vector<int> next;
    105             int bestAccept = INT_MAX;
    106             for (int idx : state->fLabel.fStates) {
    107                 const NFAState& nfaState = fNFA.fStates[idx];
    108                 if (nfaState.accept(c)) {
    109                     for (int nextState : nfaState.fNext) {
    110                         if (fNFA.fStates[nextState].fKind == NFAState::kAccept_Kind) {
    111                             bestAccept = std::min(bestAccept, fNFA.fStates[nextState].fData[0]);
    112                         }
    113                         this->add(nextState, &next);
    114                     }
    115                 }
    116             }
    117             std::sort(next.begin(), next.end());
    118             DFAState* nextState = this->getState(DFAState::Label(next));
    119             this->addTransition(c, state->fId, nextState->fId);
    120             if (bestAccept != INT_MAX) {
    121                 while (fAccepts.size() <= (size_t) nextState->fId) {
    122                     fAccepts.push_back(INVALID);
    123                 }
    124                 fAccepts[nextState->fId] = bestAccept;
    125             }
    126             if (!nextState->fIsScanned) {
    127                 this->scanState(nextState);
    128             }
    129         }
    130     }
    131 
    132     // collapse rows with the same transitions to a single row. This is common, as each row
    133     // represents a character and often there are many characters for which all transitions are
    134     // identical (e.g. [0-9] are treated the same way by all lexer rules)
    135     void computeMappings() {
    136         // mappings[<input row>] = <output row>
    137         std::vector<std::vector<int>*> uniques;
    138         // this could be done more efficiently, but O(n^2) is plenty fast for our purposes
    139         for (size_t i = 0; i < fTransitions.size(); ++i) {
    140             int found = -1;
    141             for (size_t j = 0; j < uniques.size(); ++j) {
    142                 if (*uniques[j] == fTransitions[i]) {
    143                     found = j;
    144                     break;
    145                 }
    146             }
    147             if (found == -1) {
    148                 found = (int) uniques.size();
    149                 uniques.push_back(&fTransitions[i]);
    150             }
    151             fCharMappings.push_back(found);
    152         }
    153         std::vector<std::vector<int>> newTransitions;
    154         for (std::vector<int>* row : uniques) {
    155             newTransitions.push_back(*row);
    156         }
    157         fTransitions = newTransitions;
    158     }
    159 
    160     const NFA& fNFA;
    161     std::unordered_map<DFAState::Label, std::unique_ptr<DFAState>> fStates;
    162     std::vector<std::vector<int>> fTransitions;
    163     std::vector<int> fCharMappings;
    164     std::vector<int> fAccepts;
    165 };
    166