Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2014 International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  rbbidata.h
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   RBBI data formats  Includes
     14 *
     15 *                          Structs that describe the format of the Binary RBBI data,
     16 *                          as it is stored in ICU's data file.
     17 *
     18 *      RBBIDataWrapper  -  Instances of this class sit between the
     19 *                          raw data structs and the RulesBasedBreakIterator objects
     20 *                          that are created by applications.  The wrapper class
     21 *                          provides reference counting for the underlying data,
     22 *                          and direct pointers to data that would not otherwise
     23 *                          be accessible without ugly pointer arithmetic.  The
     24 *                          wrapper does not attempt to provide any higher level
     25 *                          abstractions for the data itself.
     26 *
     27 *                          There will be only one instance of RBBIDataWrapper for any
     28 *                          set of RBBI run time data being shared by instances
     29 *                          (clones) of RulesBasedBreakIterator.
     30 */
     31 
     32 #ifndef __RBBIDATA_H__
     33 #define __RBBIDATA_H__
     34 
     35 #include "unicode/utypes.h"
     36 #include "unicode/udata.h"
     37 #include "udataswp.h"
     38 
     39 /**
     40  * Swap RBBI data. See udataswp.h, which presumably defines UDataSwapper and the argument/return contract (confirm there).
     41  * @internal
     42  */
     43 U_CAPI int32_t U_EXPORT2
     44 ubrk_swap(const UDataSwapper *ds,
     45           const void *inData, int32_t length, void *outData,
     46           UErrorCode *pErrorCode);   /* declared outside the __cplusplus section below, so also visible to C code */
     47 
     48 #ifdef __cplusplus
     49 
     50 #include "unicode/uobject.h"
     51 #include "unicode/unistr.h"
     52 #include "umutex.h"
     53 #include "utrie.h"
     54 
     55 U_NAMESPACE_BEGIN
     56 
     57 /*
     58  *   The following structs map exactly onto the raw data from ICU common data file.
     59  */
     60 struct RBBIDataHeader {                /*  Header of the binary RBBI data; section offsets below are relative to its start. */
     61     uint32_t         fMagic;           /*  == 0xb1a0  (hexadecimal; digit one, not the letter 'l')  */
     62     uint8_t          fFormatVersion[4]; /* Data Format.  Same as the value in struct UDataInfo      */
     63                                        /*   if there is one associated with this data.             */
     64                                        /*     (version originates in rbbi, is copied to UDataInfo) */
     65                                        /*   For ICU 3.2 and earlier, this field was                */
     66                                        /*       uint32_t  fVersion                                 */
     67                                        /*   with a value of 1.                                     */
     68     uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,                */
     69                                        /*      including all sections, not just the header.        */
     70     uint32_t         fCatCount;        /*  Number of character categories.                         */
     71 
     72     /*                                                                        */
     73     /*  Offsets and sizes of each of the subsections within the RBBI data.    */
     74     /*  All offsets are bytes from the start of the RBBIDataHeader.           */
     75     /*  All sizes are in bytes.                                               */
     76     /*                                                                        */
     77     uint32_t         fFTable;         /*  Offset to the forward state transition table. */
     78     uint32_t         fFTableLen;      /*  Size of the forward table. */
     79     uint32_t         fRTable;         /*  Offset to the reverse state transition table. */
     80     uint32_t         fRTableLen;      /*  Size of the reverse table. */
     81     uint32_t         fSFTable;        /*  Offset to the safe point forward transition table. */
     82     uint32_t         fSFTableLen;     /*  Size of the safe point forward table. */
     83     uint32_t         fSRTable;        /*  Offset to the safe point reverse transition table. */
     84     uint32_t         fSRTableLen;     /*  Size of the safe point reverse table. */
     85     uint32_t         fTrie;           /*  Offset to Trie data for character categories */
     86     uint32_t         fTrieLen;        /*  Size of the Trie data. */
     87     uint32_t         fRuleSource;     /*  Offset to the source for the break */
     88     uint32_t         fRuleSourceLen;  /*    rules.  Stored UChar *.  (Len: size of that source.) */
     89     uint32_t         fStatusTable;    /* Offset to the table of rule status values */
     90     uint32_t         fStatusTableLen; /* Size of the rule status table. */
     91 
     92     uint32_t         fReserved[6];    /*  Reserved for expansion */
     93 
     94 };
     95 
     96 
     97 
     98 struct  RBBIStateTableRow {         /*  One row of a state table.  Rows are variable length; see fNextState. */
     99     int16_t          fAccepting;    /*  Non-zero if this row is for an accepting state.   */
    100                                     /*  Value 0: not an accepting state.                  */
    101                                     /*       -1: Unconditional Accepting state.           */
    102                                     /*    positive:  Look-ahead match has completed.      */
    103                                     /*           Actual boundary position happened earlier */
    104                                     /*           Value here == fLookAhead in earlier      */
    105                                     /*              state, at actual boundary pos.        */
    106     int16_t          fLookAhead;    /*  Non-zero if this row is for a state that          */
    107                                     /*    corresponds to a '/' in the rule source.        */
    108                                     /*    Value is the same as the fAccepting             */
    109                                     /*      value for the rule (which will appear         */
    110                                     /*      in a different state).                        */
    111     int16_t          fTagIdx;       /*  Non-zero if this row covers a {tagged} position   */
    112                                     /*     from a rule.  Value is the index in the        */
    113                                     /*     StatusTable of the set of matching             */
    114                                     /*     tags (rule status values)                      */
    115     int16_t          fReserved;     /*  Reserved.                                         */
    116     uint16_t         fNextState[2]; /*  Next State, indexed by char category.             */
    117                                     /*  The declared size of two is a placeholder only;   */
    118                                     /*    the actual array size is fData->fHeader->fCatCount */
    119                                     /*    CAUTION:  see RBBITableBuilder::getTableSize()  */
    120                                     /*              before changing anything here.        */
    121 };
    122 
    123 
    124 struct RBBIStateTable {             /*  Fixed fields of a state table, followed by its rows in fTableData. */
    125     uint32_t         fNumStates;    /*  Number of states.                                 */
    126     uint32_t         fRowLen;       /*  Length of a state table row, in bytes.            */
    127     uint32_t         fFlags;        /*  Option Flags for this state table                 */
    128                                     /*    (presumably RBBIStateTableFlags values; confirm) */
    129     uint32_t         fReserved;     /*  reserved                                          */
    130     char             fTableData[4]; /*  First RBBIStateTableRow begins here.              */
    131                                     /*    (making it char[] simplifies ugly address       */
    132                                     /*     arithmetic for indexing variable length rows.) */
    133 };
    133 
    134 typedef enum {                      /*  Distinct single-bit values; presumably flags for RBBIStateTable::fFlags (confirm against users). */
    135     RBBI_LOOKAHEAD_HARD_BREAK = 1,
    136     RBBI_BOF_REQUIRED = 2
    137 } RBBIStateTableFlags;
    138 
    139 
    140 /*   Reference counting wrapper around one set of RBBI run time data.      */
    141 /*   It exposes direct pointers to the sections of that data; the sharing  */
    142 /*   model is described in the comment at the top of this file.            */
    143 class RBBIDataWrapper : public UMemory {
    144 public:
    145     enum EDontAdopt {            /* tag type; selects the three-argument constructor overload below */
    146         kDontAdopt
    147     };
    148     RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
    149     RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);  /* presumably leaves ownership of data with the caller (cf. fDontFreeData); confirm in the implementation */
    150     RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);  /* presumably keeps udm in fUDataMem; confirm in the implementation */
    151     ~RBBIDataWrapper();
    152 
    153     void                  init0();
    154     void                  init(const RBBIDataHeader *data, UErrorCode &status);
    155     RBBIDataWrapper      *addReference();      /* reference counting; the count is held in fRefCount */
    156     void                  removeReference();
    157     UBool                 operator ==(const RBBIDataWrapper &other) const;
    158     int32_t               hashCode();          /* NOTE(review): not const, unlike operator== ; confirm whether intentional */
    159     const UnicodeString  &getRuleSourceString() const;  /* presumably returns fRuleString; confirm */
    160 #ifdef RBBI_DEBUG
    161     void                  printData();
    162     void                  printTable(const char *heading, const RBBIStateTable *table);
    163 #else   /* Without RBBI_DEBUG both names become function-like macros that expand to nothing.  Macros are not scoped to this class, so they apply to any later use of these names. */
    164     #define printData()
    165     #define printTable(heading, table)
    166 #endif
    167 
    168     /*                                     */
    169     /*   Pointers to items within the data */
    170     /*                                     */
    171     const RBBIDataHeader     *fHeader;
    172     const RBBIStateTable     *fForwardTable;
    173     const RBBIStateTable     *fReverseTable;
    174     const RBBIStateTable     *fSafeFwdTable;
    175     const RBBIStateTable     *fSafeRevTable;
    176     const UChar              *fRuleSource;
    177     const int32_t            *fRuleStatusTable;
    178 
    179     /* number of int32_t values in the rule status table.   Used to sanity check indexing */
    180     int32_t             fStatusMaxIdx;
    181 
    182     UTrie               fTrie;          /* presumably built from the Trie section named by RBBIDataHeader::fTrie; confirm */
    183 
    184 private:
    185     u_atomic_int32_t    fRefCount;      /* reference count, stored in an atomic integer type */
    186     UDataMemory  *fUDataMem;
    187     UnicodeString       fRuleString;
    188     UBool               fDontFreeData;  /* presumably set when the data is not owned by this wrapper (cf. kDontAdopt); confirm */
    189 
    190     RBBIDataWrapper(const RBBIDataWrapper &other); /*  forbid copying of this class */
    191     RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /*  forbid copying of this class */
    192 };
    193 
    194 
    195 
    196 U_NAMESPACE_END
    197 
    198 #endif /* C++ */
    199 
    200 #endif
    201