Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2005,2008 International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  rbbidata.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   RBBI data formats  Includes
     14 *
     15 *                          Structs that describe the format of the binary RBBI data,
     16 *                          as it is stored in ICU's data file.
     17 *
     18 *      RBBIDataWrapper  -  Instances of this class sit between the
     19 *                          raw data structs and the RulesBasedBreakIterator objects
     20 *                          that are created by applications.  The wrapper class
     21 *                          provides reference counting for the underlying data,
     22 *                          and direct pointers to data that would not otherwise
     23 *                          be accessible without ugly pointer arithmetic.  The
     24 *                          wrapper does not attempt to provide any higher level
     25 *                          abstractions for the data itself.
     26 *
     27 *                          There will be only one instance of RBBIDataWrapper for any
     28 *                          set of RBBI run time data being shared by instances
     29 *                          (clones) of RulesBasedBreakIterator.
     30 */
     31 
     32 #ifndef __RBBIDATA_H__
     33 #define __RBBIDATA_H__
     34 
     35 #include "unicode/utypes.h"
     36 #include "unicode/udata.h"
     37 #include "udataswp.h"
     38 
     39 /**
     40  * Swap RBBI data with the given UDataSwapper. See udataswp.h for the parameter, return value and pErrorCode conventions.
     41  * @internal
     42  */
     43 U_CAPI int32_t U_EXPORT2
     44 ubrk_swap(const UDataSwapper *ds,
     45           const void *inData, int32_t length, void *outData,
     46           UErrorCode *pErrorCode);
     47 
     48 #ifdef XP_CPLUSPLUS
     49 
     50 #include "unicode/uobject.h"
     51 #include "unicode/unistr.h"
     52 #include "utrie.h"
     53 
     54 U_NAMESPACE_BEGIN
     55 
     56 /*
     57  *   The following structs map exactly onto the raw data from the ICU common data file, so field order, types and sizes are part of the binary format.
     58  */
     59 struct RBBIDataHeader {
     60     uint32_t         fMagic;           /*  == 0xb1a0                                               */
     61     uint8_t          fFormatVersion[4]; /* Data Format.  Same as the value in struct UDataInfo      */
     62                                        /*   if there is one associated with this data.             */
     63                                        /*     (version originates in rbbi, is copied to UDataInfo) */
     64                                        /*   For ICU 3.2 and earlier, this field was                */
     65                                        /*       uint32_t  fVersion                                 */
     66                                        /*   with a value of 1.                                     */
     67     uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,                */
     68                                        /*      including all sections, not just the header.        */
     69     uint32_t         fCatCount;        /*  Number of character categories.                         */
     70 
     71     /*                                                                        */
     72     /*  Offsets and sizes of each of the subsections within the RBBI data.    */
     73     /*  All offsets are bytes from the start of the RBBIDataHeader.           */
     74     /*  All sizes are in bytes.                                               */
     75     /*                                                                        */
     76     uint32_t         fFTable;         /*  Offset to the forward state transition table. */
     77     uint32_t         fFTableLen;      /*  Size of the forward table, in bytes. */
     78     uint32_t         fRTable;         /*  Offset to the reverse state transition table. */
     79     uint32_t         fRTableLen;      /*  Size of the reverse table, in bytes. */
     80     uint32_t         fSFTable;        /*  Offset to the safe point forward transition table. */
     81     uint32_t         fSFTableLen;     /*  Size of the safe point forward table, in bytes. */
     82     uint32_t         fSRTable;        /*  Offset to the safe point reverse transition table. */
     83     uint32_t         fSRTableLen;     /*  Size of the safe point reverse table, in bytes. */
     84     uint32_t         fTrie;           /*  Offset to Trie data for character categories */
     85     uint32_t         fTrieLen;        /*  Size of the Trie data, in bytes. */
     86     uint32_t         fRuleSource;     /*  Offset to the source for the break rules (stored as UChars). */
     87     uint32_t         fRuleSourceLen;  /*  Size of the rule source, in bytes. */
     88     uint32_t         fStatusTable;    /* Offset to the table of rule status values */
     89     uint32_t         fStatusTableLen; /* Size of the rule status table, in bytes. */
     90 
     91     uint32_t         fReserved[6];    /*  Reserved for expansion */
     92 
     93 };
     94 
     95 
     96 /*  One row (one state) of a state transition table: per-state flags followed by the next-state array.  */
     97 struct  RBBIStateTableRow {
     98     int16_t          fAccepting;    /*  Non-zero if this row is for an accepting state.   */
     99                                     /*  Value 0: not an accepting state.                  */
    100                                     /*       -1: Unconditional Accepting state.           */
    101                                     /*    positive:  Look-ahead match has completed.      */
    102                                     /*           Actual boundary position happened earlier. */
    103                                     /*           Value here == fLookAhead in earlier      */
    104                                     /*              state, at actual boundary pos.        */
    105     int16_t          fLookAhead;    /*  Non-zero if this row is for a state that          */
    106                                     /*    corresponds to a '/' in the rule source.        */
    107                                     /*    Value is the same as the fAccepting             */
    108                                     /*      value for the rule (which will appear         */
    109                                     /*      in a different state).                        */
    110     int16_t          fTagIdx;       /*  Non-zero if this row covers a {tagged} position   */
    111                                     /*     from a rule.  Value is the index in the        */
    112                                     /*     StatusTable of the set of matching             */
    113                                     /*     tags (rule status values).                     */
    114     int16_t          fReserved;     /*  Reserved.                                         */
    115     uint16_t         fNextState[2]; /*  Next State, indexed by char category.             */
    116                                     /*    Declared as [2]; the real size is fNumCols, so  */
    117                                     /*    row length varies (RBBIStateTable::fRowLen).    */
    118                                     /*    CAUTION:  see RBBITableBuilder::getTableSize()  */
    119                                     /*              before changing anything here.        */
    120 };
    121 
    122 /*  Header of one state transition table; the variable length rows start at fTableData.  */
    123 struct RBBIStateTable {
    124     uint32_t         fNumStates;    /*  Number of states.                                 */
    125     uint32_t         fRowLen;       /*  Length of a state table row, in bytes.            */
    126     uint32_t         fFlags;        /*  Option Flags for this state table                 */
    127     uint32_t         fReserved;     /*  reserved                                          */
    128     char             fTableData[4]; /*  First RBBIStateTableRow begins here.              */
    129                                     /*    (making it char[] simplifies ugly address       */
    130                                     /*     arithmetic for indexing variable length rows.) */
    131 };
    132 /*  Flag values for a state table; presumably the bits of RBBIStateTable::fFlags (TODO confirm against the users of fFlags).  */
    133 typedef enum {
    134     RBBI_LOOKAHEAD_HARD_BREAK = 1,
    135     RBBI_BOF_REQUIRED = 2
    136 } RBBIStateTableFlags;
    137 
    138 
    139 /*   RBBIDataWrapper: the reference counting wrapper class.                          */
    140 /*   Sits between the raw data structs and the RulesBasedBreakIterator objects and   */
    141 /*   exposes direct pointers to the sections of one shared set of RBBI run time data. */
    142 class RBBIDataWrapper : public UMemory {
    143 public:
    144     enum EDontAdopt {    /* Tag type that selects the three-argument constructor below. */
    145         kDontAdopt
    146     };
    147     RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
    148     RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);    /* NOTE(review): presumably leaves ownership of data with the caller (see fDontFreeData) -- confirm. */
    149     RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
    150     ~RBBIDataWrapper();
    151 
    152     void                  init(const RBBIDataHeader *data, UErrorCode &status);
    153     RBBIDataWrapper      *addReference();       /* NOTE(review): presumably increments fRefCount and returns this -- confirm in the implementation. */
    154     void                  removeReference();    /* NOTE(review): presumably decrements fRefCount -- confirm what happens at zero. */
    155     UBool                 operator ==(const RBBIDataWrapper &other) const;
    156     int32_t               hashCode();
    157     const UnicodeString  &getRuleSourceString() const;
    158 #ifdef RBBI_DEBUG
    159     void                  printData();
    160     void                  printTable(const char *heading, const RBBIStateTable *table);
    161 #else  /* Without RBBI_DEBUG, printData() and printTable() are macros that expand to nothing. */
    162     #define printData()
    163     #define printTable(heading, table)
    164 #endif
    165 
    166     /*                                                                         */
    167     /*   Pointers to items within the data (public, for direct access by users) */
    168     /*                                                                         */
    169     const RBBIDataHeader     *fHeader;
    170     const RBBIStateTable     *fForwardTable;
    171     const RBBIStateTable     *fReverseTable;
    172     const RBBIStateTable     *fSafeFwdTable;
    173     const RBBIStateTable     *fSafeRevTable;
    174     const UChar              *fRuleSource;
    175     const int32_t            *fRuleStatusTable;
    176 
    177     /* number of int32_t values in the rule status table.   Used to sanity check indexing */
    178     int32_t             fStatusMaxIdx;
    179 
    180     UTrie               fTrie;    /* NOTE(review): presumably the character category trie described by RBBIDataHeader::fTrie -- confirm. */
    181 
    182 private:
    183     int32_t             fRefCount;       /* Reference count; see addReference() / removeReference(). */
    184     UDataMemory        *fUDataMem;       /* NOTE(review): presumably the UDataMemory passed to the udm constructor -- confirm. */
    185     UnicodeString       fRuleString;     /* NOTE(review): presumably what getRuleSourceString() returns -- confirm. */
    186     UBool               fDontFreeData;   /* NOTE(review): presumably set by the kDontAdopt constructor -- confirm. */
    187 
    188     RBBIDataWrapper(const RBBIDataWrapper &other); /*  forbid copying of this class */
    189     RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /*  forbid copying of this class */
    190 };
    191 
    192 
    193 
    194 U_NAMESPACE_END
    195 
    196 #endif /* C++ */
    197 
    198 #endif
    199