Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  rbbisetb.h
      5 /*
      6 **********************************************************************
      7 *   Copyright (c) 2001-2005, International Business Machines
      8 *   Corporation and others.  All Rights Reserved.
      9 **********************************************************************
     10 */
     11 
     12 #ifndef RBBISETB_H
     13 #define RBBISETB_H
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_BREAK_ITERATION
     18 
     19 #include "unicode/uobject.h"
     20 #include "rbbirb.h"
     21 #include "utrie2.h"
     22 #include "uvector.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 //
     27 //  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
     28 //                   from the Unicode Sets appearing in the source  RBBI rules, and
     29 //                   creates the TRIE table used to map from Unicode to the
     30 //                   character categories.
     31 //
     32 
     33 
     34 //
     35 //  RangeDescriptor
     36 //
     37 //     Each of the non-overlapping character ranges gets one of these descriptors.
     38 //     All of them are strung together in a linked list, which is kept in order
     39 //     (by character)
     40 //
     41 class RangeDescriptor : public UMemory {
            // Describes one character range [fStartChar, fEndChar] together with the
            // original Unicode sets that include it. Instances are chained through fNext
            // into a singly linked list.
     42 public:
     43     UChar32            fStartChar;      // Start of range, unicode 32 bit value.
     44     UChar32            fEndChar;        // End of range, unicode 32 bit value.
     45     int32_t            fNum;            // runtime-mapped input value for this range.
     46     UVector           *fIncludesSets;   // vector of the original
     47                                         //   Unicode sets that include this range.
     48                                         //    (Contains ptrs to uset nodes)
     49     RangeDescriptor   *fNext;           // Next RangeDescriptor in the linked list.
     50 
     51     RangeDescriptor(UErrorCode &status);   // status: ICU in/out error code; presumably set on failure - confirm in the .cpp
     52     RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);   // Copy that can report failure through status.
     53     ~RangeDescriptor();                 // NOTE(review): presumably releases fIncludesSets - confirm in the .cpp
     54     void split(UChar32 where, UErrorCode &status);   // Split this range in two at "where", with
     55                                         //   where appearing in the second (higher) part.
     56     void setDictionaryFlag();           // Check whether this range appears as part of
     57                                         //   the Unicode set named "dictionary"
     58 
     59 private:
     60     RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class (use the two-argument form above)
     61     RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
     62 };
     63 
     64 
     65 //
     66 //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
     67 //
     68 //      Starting with the rules parse tree from the scanner,
     69 //
     70 //                   -  Enumerate the set of UnicodeSets that are referenced
     71 //                      by the RBBI rules.
     72 //                   -  compute a derived set of non-overlapping UnicodeSets
     73 //                      that will correspond to columns in the state table for
     74 //                      the RBBI execution engine.
     75 //                   -  construct the trie table that maps input characters
     76 //                      to set numbers in the non-overlapping set of sets.
     77 //
     78 
     79 
     80 class RBBISetBuilder : public UMemory {
            // Turns the Unicode sets referenced by the RBBI rules into numbered character
            // categories and holds the trie that maps characters to those categories.
     81 public:
     82     RBBISetBuilder(RBBIRuleBuilder *rb);     // rb: the rule builder this set builder works for (see fRB).
     83     ~RBBISetBuilder();
     84 
     85     void     build();                        // NOTE(review): presumably runs the steps listed in the class overview - confirm in the .cpp
     86     void     addValToSets(UVector *sets,      uint32_t val);   // Apply val to each set in the vector (presumably via addValToSet - confirm).
     87     void     addValToSet (RBBINode *usetNode, uint32_t val);   // Apply val to a single uset node.
     88     int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
     89                                              //    runtime state machine, which are the same as
     90                                              //    columns in the DFA state table
     91     int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie.
                                                     //   Not const; presumably it may update fTrie/fTrieSize - confirm.
     92     void     serializeTrie(uint8_t *where);  // write out the serialized Trie.
                                                     //   NOTE(review): "where" presumably needs room for getTrieSize() bytes - confirm.
     93     UChar32  getFirstChar(int32_t  val) const;   // presumably the first character mapped to category val - confirm
     94     UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo
     95                                              //   character were encountered.
     96 #ifdef RBBI_DEBUG
     97     void     printSets();
     98     void     printRanges();
     99     void     printRangeGroups();
    100 #else
            // Without RBBI_DEBUG the debug printers become empty function-like macros, so
            // calls to them expand to nothing. Being preprocessor macros they are not scoped
            // to this class: they apply to any later use of these names followed by "()".
    101     #define printSets()
    102     #define printRanges()
    103     #define printRangeGroups()
    104 #endif
    105 
    106 private:
    107     void           numberSets();
    108 
    109     RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
    110     UErrorCode            *fStatus;         // Pointer to an ICU error code; presumably the rule builder's - confirm.
    111 
    112     RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors
    113 
    114     UTrie2                *fTrie;           // The mapping TRIE that is the end result of processing
    115     uint32_t               fTrieSize;       //  the Unicode Sets.
    116 
    117     // Groups correspond to character categories -
    118     //       groups of ranges that are in the same original UnicodeSets.
    119     //       fGroupCount is the index of the last used group.
    120     //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
    121     //       State table column 0 is not used.  Column 1 is for end-of-input.
    122     //       column 2 is for group 0.  Funny counting.
    123     int32_t               fGroupCount;
    124 
    125     UBool                 fSawBOF;          // Value reported by sawBOF(), presumably - confirm in the .cpp
    126 
    127     RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
    128     RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
    129 };
    130 
    131 
    132 
    133 U_NAMESPACE_END
    134 
    135 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    136 
    137 #endif
    138