Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 1999-2014 International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  rbbidata.h
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   RBBI data formats  Includes
     16 *
     17 *                          Structs that describe the format of the binary RBBI data,
     18 *                          as it is stored in ICU's data file.
     19 *
     20 *      RBBIDataWrapper  -  Instances of this class sit between the
     21 *                          raw data structs and the RulesBasedBreakIterator objects
     22 *                          that are created by applications.  The wrapper class
     23 *                          provides reference counting for the underlying data,
     24 *                          and direct pointers to data that would not otherwise
     25 *                          be accessible without ugly pointer arithmetic.  The
     26 *                          wrapper does not attempt to provide any higher level
     27 *                          abstractions for the data itself.
     28 *
     29 *                          There will be only one instance of RBBIDataWrapper for any
     30 *                          set of RBBI run time data being shared by instances
     31 *                          (clones) of RulesBasedBreakIterator.
     32 */
     33 
     34 #ifndef __RBBIDATA_H__
     35 #define __RBBIDATA_H__
     36 
     37 #include "unicode/utypes.h"
     38 #include "unicode/udata.h"
     39 #include "udataswp.h"
     40 
     41 /**
     42  * Swap RBBI data. See udataswp.h.
         *
         * This prototype sits outside the __cplusplus-only part of this header, so
         * it is visible to both C and C++ translation units.  The detailed contract
         * (meaning of the return value, which error codes are reported) is not
         * stated in this header; see udataswp.h and the implementation.
         *
         * @param ds         The UDataSwapper to use (type declared via udataswp.h).
         * @param inData     Input data; presumably an RBBI data image -- TODO confirm.
         * @param length     int32_t length that accompanies inData; its units and
         *                   any special values are not stated here -- TODO confirm.
         * @param outData    Output buffer.
         * @param pErrorCode Pointer to the ICU error code used for reporting.
         * @return           int32_t result; its meaning is not documented in this
         *                   header -- TODO confirm in udataswp.h.
     43  * @internal
     44  */
     45 U_CAPI int32_t U_EXPORT2
     46 ubrk_swap(const UDataSwapper *ds,
     47           const void *inData, int32_t length, void *outData,
     48           UErrorCode *pErrorCode);
     49 
     50 #ifdef __cplusplus
     51 
     52 #include "unicode/uobject.h"
     53 #include "unicode/unistr.h"
     54 #include "umutex.h"
     55 #include "utrie.h"
     56 
     57 U_NAMESPACE_BEGIN
     58 
     59 /*
     60  *   The following structs map exactly onto the raw data from ICU common data file.
         *
         *   RBBIDataHeader comes first: every section offset below is measured
         *   from the start of this header, and fLength covers the header plus all
         *   sections.  Because the layout mirrors the stored bytes, the order,
         *   types and sizes of the fields are part of the data format.
     61  */
     62 struct RBBIDataHeader {
     63     uint32_t         fMagic;           /*  Magic number.  The original note read "== 0xbla0",      */
                                               /*    which is not a valid hex literal; presumably 0xb1a0   */
                                               /*    is meant -- TODO confirm against the code that        */
                                               /*    checks fMagic.                                        */
     64     uint8_t          fFormatVersion[4]; /* Data Format.  Same as the value in struct UDataInfo      */
     65                                        /*   if there is one associated with this data.             */
     66                                        /*     (version originates in rbbi, is copied to UDataInfo) */
     67                                        /*   For ICU 3.2 and earlier, this field was                */
     68                                        /*       uint32_t  fVersion                                 */
     69                                        /*   with a value of 1.                                     */
     70     uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,                */
     71                                        /*      including all sections, not just the header.        */
     72     uint32_t         fCatCount;        /*  Number of character categories.                         */
     73 
     74     /*                                                                        */
     75     /*  Offsets and sizes of each of the subsections within the RBBI data.    */
     76     /*  All offsets are bytes from the start of the RBBIDataHeader.           */
     77     /*  All sizes are in bytes.                                               */
     78     /*                                                                        */
     79     uint32_t         fFTable;         /*  Offset to the forward state transition table. */
     80     uint32_t         fFTableLen;      /*  Size of the forward table section. */
     81     uint32_t         fRTable;         /*  Offset to the reverse state transition table. */
     82     uint32_t         fRTableLen;      /*  Size of the reverse table section. */
     83     uint32_t         fSFTable;        /*  Offset to the safe point forward transition table. */
     84     uint32_t         fSFTableLen;     /*  Size of the safe point forward table section. */
     85     uint32_t         fSRTable;        /*  Offset to the safe point reverse transition table. */
     86     uint32_t         fSRTableLen;     /*  Size of the safe point reverse table section. */
     87     uint32_t         fTrie;           /*  Offset to Trie data for character categories */
     88     uint32_t         fTrieLen;        /*  Size of the Trie data section. */
     89     uint32_t         fRuleSource;     /*  Offset to the source for the break rules. */
     90     uint32_t         fRuleSourceLen;  /*  Size of the rule source, which is stored as UChar *. */
     91     uint32_t         fStatusTable;    /* Offset to the table of rule status values */
     92     uint32_t         fStatusTableLen; /* Size of the rule status table section. */
     93 
     94     uint32_t         fReserved[6];    /*  Reserved for expansion */
     95 
     96 };
     97 
     98 
     99 
    100 struct  RBBIStateTableRow {
            /*  One row of a state transition table: four fixed int16_t fields       */
            /*  followed by the next-state array.  Rows are variable length (see     */
            /*  the fNextState note below), so sizeof(RBBIStateTableRow) is not the  */
            /*  real size of a stored row; RBBIStateTable::fRowLen records the row   */
            /*  length in bytes.                                                     */
    101     int16_t          fAccepting;    /*  Non-zero if this row is for an accepting state.   */
    102                                     /*  Value 0: not an accepting state.                  */
    103                                     /*       -1: Unconditional Accepting state.           */
    104                                     /*    positive:  Look-ahead match has completed.      */
    105                                     /*           Actual boundary position happened earlier */
    106                                     /*           Value here == fLookAhead in earlier      */
    107                                     /*              state, at actual boundary pos.        */
    108     int16_t          fLookAhead;    /*  Non-zero if this row is for a state that          */
    109                                     /*    corresponds to a '/' in the rule source.        */
    110                                     /*    Value is the same as the fAccepting             */
    111                                     /*      value for the rule (which will appear         */
    112                                     /*      in a different state).                        */
    113     int16_t          fTagIdx;       /*  Non-zero if this row covers a {tagged} position   */
    114                                     /*     from a rule.  Value is the index in the        */
    115                                     /*     StatusTable of the set of matching             */
    116                                     /*     tags (rule status values)                      */
    117     int16_t          fReserved;
    118     uint16_t         fNextState[2]; /*  Next State, indexed by char category.             */
    119                                     /*  The declared size of 2 is only a placeholder:     */
    120                                     /*    the array size is actually                      */
                                            /*    fData->fHeader->fCatCount.                      */
    121                                     /*    CAUTION:  see RBBITableBuilder::getTableSize()  */
    122                                     /*              before changing anything here.        */
    123 };
    124 
    125 
    126 struct RBBIStateTable {
            /*  Header of one state transition table, followed directly by the row   */
            /*  data.  The rows are RBBIStateTableRow records of fRowLen bytes each.  */
    127     uint32_t         fNumStates;    /*  Number of states.                                 */
    128     uint32_t         fRowLen;       /*  Length of a state table row, in bytes.            */
    129     uint32_t         fFlags;        /*  Option Flags for this state table                 */
                                            /*    (presumably RBBIStateTableFlags values --       */
                                            /*     TODO confirm).                                 */
    130     uint32_t         fReserved;     /*  reserved                                          */
    131     char             fTableData[4]; /*  First RBBIStateTableRow begins here.              */
    132                                     /*    (making it char[] simplifies ugly address       */
    133                                     /*     arithmetic for indexing variable length rows.) */
                                            /*  The declared size of 4 is a placeholder: even a   */
                                            /*    single row is larger than 4 bytes.              */
    134 };
    135 
    136 typedef enum {
            /*  Named flag values for a state table.  Presumably these are stored in */
            /*  RBBIStateTable::fFlags ("Option Flags") -- TODO confirm.  The two     */
            /*  values occupy distinct bits (1 and 2).  Their exact effect on        */
            /*  iteration is not described in this header.                           */
    137     RBBI_LOOKAHEAD_HARD_BREAK = 1,
    138     RBBI_BOF_REQUIRED = 2
    139 } RBBIStateTableFlags;
    140 
    141 
    142 /*                                        */
    143 /*   The reference counting wrapper class */
    144 /*                                        */
        /*   Sits between the raw data structs declared in this header and the      */
        /*   RulesBasedBreakIterator objects created by applications.  It provides  */
        /*   reference counting for the underlying data and direct pointers to the  */
        /*   sections inside it; it adds no higher level abstraction.  One instance */
        /*   is shared by all clones that use the same run time data.               */
    145 class RBBIDataWrapper : public UMemory {
    146 public:
            /*  Tag type used only to select the second constructor overload.        */
            /*  Presumably it means the wrapper does not take ownership of the data  */
            /*  (compare fDontFreeData) -- TODO confirm in the implementation.       */
    147     enum EDontAdopt {
    148         kDontAdopt
    149     };
            /*  Three ways to construct a wrapper: from a raw header pointer, from a */
            /*  raw header pointer with the kDontAdopt tag, or from a UDataMemory.   */
            /*  Each takes a UErrorCode reference, presumably used to report         */
            /*  failure -- TODO confirm.                                             */
    150     RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
    151     RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);
    152     RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
    153     ~RBBIDataWrapper();
    154 
            /*  Initialization helpers; presumably shared by the constructors --     */
            /*  TODO confirm.  They are public here, not private.                    */
    155     void                  init0();
    156     void                  init(const RBBIDataHeader *data, UErrorCode &status);
            /*  Reference counting interface (see fRefCount below).  What happens    */
            /*  when the count drops to zero is defined in the implementation.       */
    157     RBBIDataWrapper      *addReference();
    158     void                  removeReference();
    159     UBool                 operator ==(const RBBIDataWrapper &other) const;
            /*  NOTE(review): hashCode() is not declared const, unlike operator==    */
            /*  and getRuleSourceString().                                           */
    160     int32_t               hashCode();
    161     const UnicodeString  &getRuleSourceString() const;
            /*  Debug printing exists only when RBBI_DEBUG is defined.  Otherwise    */
            /*  printData() and printTable(heading, table) become function-like      */
            /*  macros that expand to nothing.  Macros ignore class scope, so they   */
            /*  apply to every later use of those names in any file that includes    */
            /*  this header.                                                         */
    162 #ifdef RBBI_DEBUG
    163     void                  printData();
    164     void                  printTable(const char *heading, const RBBIStateTable *table);
    165 #else
    166     #define printData()
    167     #define printTable(heading, table)
    168 #endif
    169 
    170     /*                                     */
    171     /*   Pointers to items within the data */
    172     /*                                     */
            /*   These are public data members.  The names parallel the section     */
            /*   offsets in RBBIDataHeader (fFTable, fRTable, fSFTable, fSRTable,   */
            /*   fRuleSource, fStatusTable); presumably each pointer is the header  */
            /*   address plus the matching offset -- TODO confirm.                  */
    173     const RBBIDataHeader     *fHeader;
    174     const RBBIStateTable     *fForwardTable;
    175     const RBBIStateTable     *fReverseTable;
    176     const RBBIStateTable     *fSafeFwdTable;
    177     const RBBIStateTable     *fSafeRevTable;
    178     const UChar              *fRuleSource;
    179     const int32_t            *fRuleStatusTable;
    180 
    181     /* number of int32_t values in the rule status table.   Used to sanity check indexing */
    182     int32_t             fStatusMaxIdx;
    183 
            /*  Trie held by value.  Presumably set up from the header's fTrie       */
            /*  section (character categories) -- TODO confirm.                      */
    184     UTrie               fTrie;
    185 
    186 private:
    187     u_atomic_int32_t    fRefCount;      /*  Reference count, atomic integer type.        */
    188     UDataMemory  *fUDataMem;            /*  Presumably the UDataMemory given to the      */
                                                /*    third constructor -- TODO confirm.         */
    189     UnicodeString       fRuleString;    /*  Presumably what getRuleSourceString()        */
                                                /*    returns -- TODO confirm.                   */
    190     UBool               fDontFreeData;  /*  Presumably set for the kDontAdopt            */
                                                /*    constructor -- TODO confirm.               */
    191 
    192     RBBIDataWrapper(const RBBIDataWrapper &other); /*  forbid copying of this class */
    193     RBBIDataWrapper &operator=(const RBBIDataWrapper &other); /*  forbid copying of this class */
    194 };
    195 
    196 
    197 
    198 U_NAMESPACE_END
    199 
    200 #endif /* C++ */
    201 
    202 #endif
    203