Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  rbbisetb.h
      5 /*
      6 **********************************************************************
      7 *   Copyright (c) 2001-2005, International Business Machines
      8 *   Corporation and others.  All Rights Reserved.
      9 **********************************************************************
     10 */
     11 
     12 #ifndef RBBISETB_H
     13 #define RBBISETB_H
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/uobject.h"
     17 #include "rbbirb.h"
     18 #include "uvector.h"
     19 
     20 struct  UNewTrie;
     21 
     22 U_NAMESPACE_BEGIN
     23 
     24 //
     25 //  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
     26 //                   from the Unicode Sets appearing in the source  RBBI rules, and
     27 //                   creates the TRIE table used to map from Unicode to the
     28 //                   character categories.
     29 //
     30 
     31 
     32 //
     33 //  RangeDescriptor
     34 //
     35 //     Each of the non-overlapping character ranges gets one of these descriptors.
     36 //     All of them are strung together in a linked list, which is kept in order
     37 //     (by character)
     38 //
     39 class RangeDescriptor : public UMemory {   // One node in the ordered linked list of non-overlapping character ranges.
     40 public:
     41     UChar32            fStartChar;      // Start of range, unicode 32 bit value.
     42     UChar32            fEndChar;        // End of range, unicode 32 bit value.
     43     int32_t            fNum;            // runtime-mapped input value for this range.
     44     UVector           *fIncludesSets;   // vector of the original
     45                                         //   Unicode sets that include this range.
     46                                         //    (Contains ptrs to uset nodes)
     47     RangeDescriptor   *fNext;           // Next RangeDescriptor in the linked list.
     48     // Constructors take an ICU UErrorCode in/out argument; NOTE(review): presumably set on allocation failure - confirm in rbbisetb.cpp.
     49     RangeDescriptor(UErrorCode &status);
     50     RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);   // Copy-style construction from "other" that also takes an error code.
     51     ~RangeDescriptor();
     52     void split(UChar32 where, UErrorCode &status);   // Split this range in two at "where", with
     53                                         //   where appearing in the second (higher) part.
     54     void setDictionaryFlag();           // Check whether this range appears as part of
     55                                         //   the Unicode set named "dictionary"
     56 
     57 private:
     58     RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
     59     RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
     60 };
     61 
     62 
     63 //
     64 //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
     65 //
     66 //      Starting with the rules parse tree from the scanner,
     67 //
     68 //                   -  Enumerate the set of UnicodeSets that are referenced
     69 //                      by the RBBI rules.
     70 //                   -  compute a derived set of non-overlapping UnicodeSets
     71 //                      that will correspond to columns in the state table for
     72 //                      the RBBI execution engine.
     73 //                   -  construct the trie table that maps input characters
     74 //                      to set numbers in the non-overlapping set of sets.
     75 //
     76 
     77 
     78 class RBBISetBuilder : public UMemory {   // Turns the UnicodeSets referenced by RBBI rules into character categories and a trie.
     79 public:
     80     RBBISetBuilder(RBBIRuleBuilder *rb);     // rb: the rule builder; NOTE(review): presumably stored in fRB - confirm.
     81     ~RBBISetBuilder();
     82 
     83     void     build();                        // Entry point; NOTE(review): presumably performs the steps listed before this class - confirm.
     84     void     addValToSets(UVector *sets,      uint32_t val);   // NOTE(review): presumably applies val to every uset node in "sets" - confirm.
     85     void     addValToSet (RBBINode *usetNode, uint32_t val);   // Single-node form of the above; NOTE(review): confirm semantics in rbbisetb.cpp.
     86     int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
     87                                              //    runtime state machine, which are the same as
     88                                              //    columns in the DFA state table
     89     int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie.
     90     void     serializeTrie(uint8_t *where);  // write out the serialized Trie.
     91     UChar32  getFirstChar(int32_t  val) const;   // NOTE(review): presumably the first code point mapped to category val - confirm.
     92     UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo
     93                                              //   character were encountered.
     94 #ifdef RBBI_DEBUG
     95     void     printSets();
     96     void     printRanges();
     97     void     printRangeGroups();
     98 #else
     99     #define printSets()
    100     #define printRanges()
    101     #define printRangeGroups()
    102 #endif
    103     // NOTE: without RBBI_DEBUG the three print functions are empty function-like macros, so calls expand to nothing; macros are not scoped to this class.
    104 private:
    105     void           numberSets();            // NOTE(review): presumably assigns the category numbers - confirm in rbbisetb.cpp.
    106 
    107     RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
    108     UErrorCode            *fStatus;         // Pointer to an ICU error code; NOTE(review): presumably the rule builder's - confirm.
    109 
    110     RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors
    111 
    112     UNewTrie              *fTrie;           // The mapping TRIE that is the end result of processing
    113     uint32_t              fTrieSize;        //  the Unicode Sets.
    114 
    115     // Groups correspond to character categories -
    116     //       groups of ranges that are in the same original UnicodeSets.
    117     //       fGroupCount is the index of the last used group.
    118     //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
    119     //       State table column 0 is not used.  Column 1 is for end-of-input.
    120     //       column 2 is for group 0.  Funny counting.
    121     int32_t               fGroupCount;
    122 
    123     UBool                 fSawBOF;          // NOTE(review): presumably the value reported by sawBOF() - confirm.
    124 
    125     RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
    126     RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
    127 };
    128 
    129 
    130 
    131 U_NAMESPACE_END
    132 #endif
    133