Home | History | Annotate | Download | only in common
      1 //
      2 //  rbbisetb.h
      3 /*
      4 **********************************************************************
      5 *   Copyright (c) 2001-2005, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 */
      9 
     10 #ifndef RBBISETB_H
     11 #define RBBISETB_H
     12 
     13 #include "unicode/utypes.h"
     14 #include "unicode/uobject.h"
     15 #include "rbbirb.h"
     16 #include "uvector.h"
     17 
     18 struct  UNewTrie;
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 //
     23 //  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
     24 //                   from the Unicode Sets appearing in the source  RBBI rules, and
     25 //                   creates the TRIE table used to map from Unicode to the
     26 //                   character categories.
     27 //
     28 
     29 
     30 //
     31 //  RangeDescriptor
     32 //
     33 //     Each of the non-overlapping character ranges gets one of these descriptors.
     34 //     All of them are strung together in a linked list, which is kept in order
     35 //     (by character)
     36 //
     37 class RangeDescriptor : public UMemory {
     38 public:
     39     UChar32            fStartChar;      // Start of range, Unicode 32 bit value.
     40     UChar32            fEndChar;        // End of range, Unicode 32 bit value.
     41     int32_t            fNum;            // Runtime-mapped input value for this range.
     42     UVector           *fIncludesSets;   // Vector of the original
     43                                         //   Unicode sets that include this range.
     44                                         //    (Contains ptrs to uset nodes)
     45     RangeDescriptor   *fNext;           // Next RangeDescriptor in the linked list.
     46 
     47     RangeDescriptor(UErrorCode &status);   // status is an in/out error code (non-const reference).
     48     RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);   // NOTE(review): presumably copies "other"; confirm in the .cpp.
     49     ~RangeDescriptor();
     50     void split(UChar32 where, UErrorCode &status);   // Split this range in two at "where", with
     51                                         //   where appearing in the second (higher) part.
     52     void setDictionaryFlag();           // Check whether this range appears as part of
     53                                         //   the Unicode set named "dictionary".
     54 
     55 private:
     56     RangeDescriptor(const RangeDescriptor &other); // Private: forbid plain copy construction of this class.
     57     RangeDescriptor &operator=(const RangeDescriptor &other); // Private: forbid assignment of this class.
     58 };
     59 
     60 
     61 //
     62 //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
     63 //
     64 //      Starting with the rules parse tree from the scanner,
     65 //
     66 //                   -  Enumerate the set of UnicodeSets that are referenced
     67 //                      by the RBBI rules.
     68 //                   -  Compute a derived set of non-overlapping UnicodeSets
     69 //                      that will correspond to columns in the state table for
     70 //                      the RBBI execution engine.
     71 //                   -  Construct the trie table that maps input characters
     72 //                      to set numbers in the non-overlapping set of sets.
     73 //
     74 
     75 
     76 class RBBISetBuilder : public UMemory {
     77 public:
     78     RBBISetBuilder(RBBIRuleBuilder *rb);     // rb: the rule builder; NOTE(review): presumably stored in fRB - confirm in the .cpp.
     79     ~RBBISetBuilder();
     80 
     81     void     build();                        // NOTE(review): presumably runs the set processing described for this class - confirm in the .cpp.
     82     void     addValToSets(UVector *sets,      uint32_t val);   // NOTE(review): presumably applies addValToSet() to each entry of "sets" - confirm.
     83     void     addValToSet (RBBINode *usetNode, uint32_t val);   // NOTE(review): presumably associates "val" with one uset node - confirm.
     84     int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
     85                                              //    runtime state machine, which are the same as
     86                                              //    columns in the DFA state table
     87     int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie.
     88     void     serializeTrie(uint8_t *where);  // Write out the serialized Trie to "where".
     89     UChar32  getFirstChar(int32_t  val) const;   // NOTE(review): presumably the first character mapped to category "val" - confirm.
     90     UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo
     91                                              //   character were encountered.
     92 #ifdef RBBI_DEBUG                            // Debug builds only: declare the printing functions.
     93     void     printSets();
     94     void     printRanges();
     95     void     printRangeGroups();
     96 #else                                        // Otherwise calls expand to nothing. These are macros, so they are not scoped to this class.
     97     #define printSets()
     98     #define printRanges()
     99     #define printRangeGroups()
    100 #endif
    101 
    102 private:
    103     void           numberSets();            // NOTE(review): presumably assigns the group numbers counted by fGroupCount - confirm.
    104 
    105     RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
    106     UErrorCode            *fStatus;         // Pointer to the error code used for reporting - TODO confirm what it points at.
    107 
    108     RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors
    109 
    110     UNewTrie              *fTrie;           // The mapping TRIE that is the end result of processing the Unicode Sets.
    111     uint32_t              fTrieSize;        // NOTE(review): presumably the serialized size of fTrie (cf. getTrieSize()) - confirm.
    112 
    113     // Groups correspond to character categories -
    114     //       groups of ranges that are in the same original UnicodeSets.
    115     //       fGroupCount is the index of the last used group.
    116     //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
    117     //       State table column 0 is not used.  Column 1 is for end-of-input.
    118     //       column 2 is for group 0.  Funny counting.
    119     int32_t               fGroupCount;
    120 
    121     UBool                 fSawBOF;          // NOTE(review): presumably the flag reported by sawBOF() - confirm.
    122 
    123     RBBISetBuilder(const RBBISetBuilder &other); // Private: forbid plain copy construction of this class.
    124     RBBISetBuilder &operator=(const RBBISetBuilder &other); // Private: forbid assignment of this class.
    125 };
    126 
    127 
    128 
    129 U_NAMESPACE_END
    130 #endif
    131