Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2014 International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  rbbidata.h
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   RBBI data formats  Includes
     16 *
     17 *                          Structs that describe the format of the Binary RBBI data,
     18 *                          as it is stored in ICU's data file.
     19 *
     20 *      RBBIDataWrapper  -  Instances of this class sit between the
     21 *                          raw data structs and the RulesBasedBreakIterator objects
     22 *                          that are created by applications.  The wrapper class
     23 *                          provides reference counting for the underlying data,
     24 *                          and direct pointers to data that would not otherwise
     25 *                          be accessible without ugly pointer arithmetic.  The
     26 *                          wrapper does not attempt to provide any higher level
     27 *                          abstractions for the data itself.
     28 *
     29 *                          There will be only one instance of RBBIDataWrapper for any
     30 *                          set of RBBI run time data being shared by instances
     31 *                          (clones) of RulesBasedBreakIterator.
     32 */
     33 
     34 #ifndef __RBBIDATA_H__
     35 #define __RBBIDATA_H__
     36 
     37 #include "unicode/utypes.h"
     38 #include "unicode/udata.h"
     39 #include "udataswp.h"
     40 
     41 /**
     42  * Swap RBBI data. See udataswp.h.
         *
         * Declared ahead of the __cplusplus-only section of this header, with C linkage
         * macros, so the prototype is also visible to C translation units.
         *
         * NOTE(review): the meaning of ds / inData / length / outData / pErrorCode and of
         * the int32_t return value is not spelled out in this header; presumably they
         * follow the common data-swapping function contract described in udataswp.h --
         * confirm there before relying on it.
     43  * @internal
     44  */
     45 U_CAPI int32_t U_EXPORT2
     46 ubrk_swap(const UDataSwapper *ds,
     47           const void *inData, int32_t length, void *outData,
     48           UErrorCode *pErrorCode);
     49 
     50 #ifdef __cplusplus
     51 
     52 #include "unicode/uobject.h"
     53 #include "unicode/unistr.h"
     54 #include "unicode/uversion.h"
     55 #include "umutex.h"
     56 #include "utrie2.h"
     57 
     58 U_NAMESPACE_BEGIN
     59 
     60 // The current RBBI data format version.
        // NOTE(review): four bytes, presumably in UVersionInfo order and presumably the value
        // expected in RBBIDataHeader::fFormatVersion -- confirm against
        // RBBIDataWrapper::isDataVersionAcceptable().
     61 static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {4, 0, 0, 0};
     62 
     63 /*
     64  *   The following structs map exactly onto the raw data from the ICU common data file.
     65  */
     66 struct RBBIDataHeader {
            /*  Header of a block of RBBI data.  The section offsets below are measured      */
            /*  from the start of this struct.  Members map exactly onto the stored bytes,   */
            /*  so their order and sizes must stay as they are.                              */
     67     uint32_t         fMagic;           /*  Presumably == 0xb1a0 (digit 1, not letter l);           */
                                               /*    confirm against the magic check in rbbidata.cpp.      */
     68     UVersionInfo     fFormatVersion;   /* Data Format.  Same as the value in struct UDataInfo      */
     69                                        /*   if there is one associated with this data.             */
     70                                        /*     (version originates in rbbi, is copied to UDataInfo) */
     71     uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,                */
     72                                        /*      including all sections, not just the header.        */
     73     uint32_t         fCatCount;        /*  Number of character categories.                         */
     74 
     75     /*                                                                        */
     76     /*  Offsets and sizes of each of the subsections within the RBBI data.    */
     77     /*  All offsets are bytes from the start of the RBBIDataHeader.           */
     78     /*  All sizes are in bytes.                                               */
     79     /*                                                                        */
     80     uint32_t         fFTable;         /*  Offset to the forward state transition table. */
     81     uint32_t         fFTableLen;      /*    Size of the forward table. */
     82     uint32_t         fRTable;         /*  Offset to the reverse state transition table. */
     83     uint32_t         fRTableLen;      /*    Size of the reverse table. */
     84     uint32_t         fSFTable;        /*  Offset to the safe point forward transition table */
     85     uint32_t         fSFTableLen;     /*    Size of the safe point forward table. */
     86     uint32_t         fSRTable;        /*  Offset to the safe point reverse transition table */
     87     uint32_t         fSRTableLen;     /*    Size of the safe point reverse table. */
     88     uint32_t         fTrie;           /*  Offset to Trie data for character categories */
     89     uint32_t         fTrieLen;        /*    Size of the Trie data. */
     90     uint32_t         fRuleSource;     /*  Offset to the source for the break rules. */
     91     uint32_t         fRuleSourceLen;  /*    Size of the rule source.  Stored UChar *. */
     92     uint32_t         fStatusTable;    /* Offset to the table of rule status values */
     93     uint32_t         fStatusTableLen; /*    Size of the rule status table. */
     94 
     95     uint32_t         fReserved[6];    /*  Reserved for expansion */
     96 
     97 };
     98 
     99 
    100 
    100 struct  RBBIStateTableRow {
            /*  One row of a state transition table.  Rows are variable length: fNextState     */
            /*  really holds one entry per character category (see the note on that member),  */
            /*  and the byte length of a row is given by RBBIStateTable::fRowLen.              */
    101     int16_t          fAccepting;    /*  Non-zero if this row is for an accepting state.   */
    102                                     /*  Value 0: not an accepting state.                  */
    103                                     /*       -1: Unconditional Accepting state.           */
    104                                     /*    positive:  Look-ahead match has completed.      */
    105                                     /*           Actual boundary position happened earlier */
    106                                     /*           Value here == fLookAhead in earlier      */
    107                                     /*              state, at actual boundary pos.        */
    108     int16_t          fLookAhead;    /*  Non-zero if this row is for a state that          */
    109                                     /*    corresponds to a '/' in the rule source.        */
    110                                     /*    Value is the same as the fAccepting             */
    111                                     /*      value for the rule (which will appear         */
    112                                     /*      in a different state).                        */
    113     int16_t          fTagIdx;       /*  Non-zero if this row covers a {tagged} position   */
    114                                     /*     from a rule.  Value is the index in the        */
    115                                     /*     StatusTable of the set of matching             */
    116                                     /*     tags (rule status values)                      */
    117     int16_t          fReserved;
    118     uint16_t         fNextState[2]; /*  Next State, indexed by char category.             */
    119                                     /*  Declared [2] is a placeholder; the real array     */
    120                                     /*    size is fData->fHeader->fCatCount.              */
    121                                     /*    CAUTION:  see RBBITableBuilder::getTableSize()  */
    122                                     /*              before changing anything here.        */
    123 };
    125 
    126 
    127 struct RBBIStateTable {
            /*  Fixed-size header of one state transition table; the variable-length          */
            /*  RBBIStateTableRow entries start at fTableData.                                */
    128     uint32_t         fNumStates;    /*  Number of states.                                 */
    129     uint32_t         fRowLen;       /*  Length of a state table row, in bytes.            */
    130     uint32_t         fFlags;        /*  Option Flags for this state table                 */
    131     uint32_t         fReserved;     /*  reserved                                          */
    132     char             fTableData[4]; /*  First RBBIStateTableRow begins here.              */
    133                                     /*    (making it char[] simplifies ugly address       */
    134                                     /*     arithmetic for indexing variable length rows.) */
                                            /*  NOTE(review): the [4] is presumably only a        */
                                            /*    placeholder size, not the real extent; confirm. */
    135 };
    136 
    137 typedef enum {
            /*  NOTE(review): presumably bit values for RBBIStateTable::fFlags (values are 1  */
            /*  and 2); the meaning of each flag is not described in this header -- confirm   */
            /*  against the code that tests fFlags.                                           */
    138     RBBI_LOOKAHEAD_HARD_BREAK = 1,
    139     RBBI_BOF_REQUIRED = 2
    140 } RBBIStateTableFlags;
    141 
    142 
    143 /*                                        */
    144 /*   The reference counting wrapper class */
        /*                                        */
        /*   Sits between the raw RBBI data       */
        /*   structs and the break iterator       */
        /*   objects that share them, and exposes */
        /*   direct pointers to the sections of   */
        /*   the data.  Copying is forbidden      */
        /*   (private copy ctor / assignment).    */
    145 /*                                        */
    146 class RBBIDataWrapper : public UMemory {
    147 public:
            /*  Tag type used only to select the constructor overload that takes it.          */
            /*  NOTE(review): presumably means the wrapper does not take ownership of the     */
            /*  data (cf. fDontFreeData below) -- confirm in rbbidata.cpp.                    */
    148     enum EDontAdopt {
    149         kDontAdopt
    150     };
            /*  Construct from a raw header pointer, from a raw header pointer with the       */
            /*  kDontAdopt tag, or from a UDataMemory handle.  Each takes a UErrorCode &.     */
    151     RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
    152     RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);
    153     RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
    154     ~RBBIDataWrapper();
    155 
    156     static UBool          isDataVersionAcceptable(const UVersionInfo version);
    157 
            /*  NOTE(review): presumably init0() resets the members and init() fills in the   */
            /*  pointers below from the header -- confirm in rbbidata.cpp.                    */
    158     void                  init0();
    159     void                  init(const RBBIDataHeader *data, UErrorCode &status);
            /*  Reference counting (see fRefCount).  NOTE(review): what happens when the      */
            /*  count reaches zero is not visible in this header -- confirm in rbbidata.cpp.  */
    160     RBBIDataWrapper      *addReference();
    161     void                  removeReference();
    162     UBool                 operator ==(const RBBIDataWrapper &other) const;
    163     int32_t               hashCode();          /* note: not declared const, unlike operator == */
    164     const UnicodeString  &getRuleSourceString() const;
            /*  Debug dump routines exist only when RBBI_DEBUG is defined.  Otherwise the     */
            /*  names are defined as empty macros so that calls expand to nothing.  The       */
            /*  macros are not scoped to this class: they apply to every later use of         */
            /*  printData / printTable in any file that includes this header.                 */
    165 #ifdef RBBI_DEBUG
    166     void                  printData();
    167     void                  printTable(const char *heading, const RBBIStateTable *table);
    168 #else
    169     #define printData()
    170     #define printTable(heading, table)
    171 #endif
    172 
    173     /*                                     */
    174     /*   Pointers to items within the data */
    175     /*                                     */
    176     const RBBIDataHeader     *fHeader;
    177     const RBBIStateTable     *fForwardTable;
    178     const RBBIStateTable     *fReverseTable;
    179     const RBBIStateTable     *fSafeFwdTable;
    180     const RBBIStateTable     *fSafeRevTable;
    181     const UChar              *fRuleSource;
    182     const int32_t            *fRuleStatusTable;
    183 
    184     /* number of int32_t values in the rule status table.   Used to sanity check indexing */
    185     int32_t             fStatusMaxIdx;
    186 
            /* NOTE(review): presumably built from the Trie section of the data (character categories); confirm */
    187     UTrie2             *fTrie;
    188 
    189 private:
    190     u_atomic_int32_t    fRefCount;      /* reference count, atomic type */
    191     UDataMemory        *fUDataMem;      /* NOTE(review): presumably the handle passed to the UDataMemory ctor; confirm */
    192     UnicodeString       fRuleString;    /* NOTE(review): presumably backs getRuleSourceString(); confirm */
    193     UBool               fDontFreeData;  /* NOTE(review): presumably set by the kDontAdopt ctor; confirm */
    194 
    195     RBBIDataWrapper(const RBBIDataWrapper &other); /*  forbid copying of this class */
    196     RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /*  forbid copying of this class */
    197 };
    198 
    199 
    200 
    201 U_NAMESPACE_END
    202 
    203 #endif /* C++ */
    204 
    205 #endif
    206