Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 * Copyright (c) 2002-2004, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 * Author: Alan Liu
      7 * Created: October 30 2002
      8 * Since: ICU 2.4
      9 **********************************************************************
     10 */
     11 #ifndef PROPNAME_H
     12 #define PROPNAME_H
     13 
     14 #include "unicode/utypes.h"
     15 #include "unicode/uchar.h"
     16 #include "udataswp.h"
     17 #include "uprops.h"
     18 
     19 /*
     20  * This header defines the in-memory layout of the property names data
     21  * structure representing the UCD data files PropertyAliases.txt and
     22  * PropertyValueAliases.txt.  It is used by:
     23  *   propname.cpp - reads data
     24  *   genpname     - creates data
     25  */
     26 
     27 /* low-level char * property name comparison -------------------------------- */
     28 
     29 U_CDECL_BEGIN
     30 
     31 /**
     32  * \var uprv_comparePropertyNames
     33  * Unicode property names and property value names are compared "loosely".
     34  *
     35  * UCD.html 4.0.1 says:
     36  *   For all property names, property value names, and for property values for
     37  *   Enumerated, Binary, or Catalog properties, use the following
     38  *   loose matching rule:
     39  *
     40  *   LM3. Ignore case, whitespace, underscore ('_'), and hyphens.
     41  *
     42  * This function does just that, for (char *) name strings.
     43  * It is almost identical to ucnv_compareNames() but also ignores
     44  * C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC).
         *
         * uprv_comparePropertyNames itself is a macro: it expands to one of the
         * two functions declared below, chosen at compile time from
         * U_CHARSET_FAMILY.
         *
         * NOTE(review): the implementation is not part of this header.  The one
         * caller here (NameToEnum::getEnum) treats 0 as "names match" and uses
         * the sign of a non-zero result as an ordering, i.e. strcmp-style --
         * confirm against the definition.
     45  *
     46  * @internal
     47  */
     48 
        /* Loose comparison for names in an ASCII-family charset. */
     49 U_CAPI int32_t U_EXPORT2
     50 uprv_compareASCIIPropertyNames(const char *name1, const char *name2);
     51 
        /* Loose comparison for names in an EBCDIC-family charset. */
     52 U_CAPI int32_t U_EXPORT2
     53 uprv_compareEBCDICPropertyNames(const char *name1, const char *name2);
     54 
        /* Bind the generic name to the variant for the platform charset family; */
        /* any family other than ASCII or EBCDIC is rejected at compile time. */
     55 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
     56 #   define uprv_comparePropertyNames uprv_compareASCIIPropertyNames
     57 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
     58 #   define uprv_comparePropertyNames uprv_compareEBCDICPropertyNames
     59 #else
     60 #   error U_CHARSET_FAMILY is not valid
     61 #endif
     62 
     63 U_CDECL_END
     64 
     65 /* UDataMemory structure and signatures ------------------------------------- */
     66 
        /* Name and type of the data item (presumably combined as "pnames.icu", */
        /* the file named in the upname_swap comment below -- TODO confirm). */
     67 #define PNAME_DATA_NAME "pnames"
     68 #define PNAME_DATA_TYPE "icu"
     69 
     70 /* Fields in UDataInfo: */
     71 
     72 /* PNAME_SIG[] is encoded as numeric literals for compatibility with the HP compiler */
        /* The four bytes below are the ASCII codes of the characters "pnam". */
     73 #define PNAME_SIG_0 ((uint8_t)0x70) /* p */
     74 #define PNAME_SIG_1 ((uint8_t)0x6E) /* n */
     75 #define PNAME_SIG_2 ((uint8_t)0x61) /* a */
     76 #define PNAME_SIG_3 ((uint8_t)0x6D) /* m */
     77 
        /* NOTE(review): this constant is cast to int8_t while the signature bytes */
        /* above are uint8_t -- confirm which type the formatVersion field uses. */
     78 #define PNAME_FORMAT_VERSION ((int8_t)1) /* formatVersion[0] */
     79 
     80 /**
     81  * Swap pnames.icu. See udataswp.h.
         *
         * NOTE(review): the parameter contract is defined in udataswp.h, which is
         * not visible here.  Presumably ds describes the conversion to perform,
         * inData/length give the input data and its size, outData receives the
         * swapped copy, pErrorCode is the in/out ICU status, and the return value
         * is a byte count -- confirm against udataswp.h before relying on this.
     82  * @internal
     83  */
     84 U_CAPI int32_t U_EXPORT2
     85 upname_swap(const UDataSwapper *ds,
     86             const void *inData, int32_t length, void *outData,
     87             UErrorCode *pErrorCode);
     88 
     89 
     90 #ifdef XP_CPLUSPLUS
     91 
     92 class Builder; /* forward declaration at global scope: the data classes
                        * below grant ::Builder friend access (presumably the
                        * genpname data writer -- TODO confirm) */
     93 
     94 U_NAMESPACE_BEGIN
     95 
     96 /**
     97  * An offset from the start of the pnames data to a contained entity.
     98  * This must be a signed value, since negative offsets are used as an
     99  * end-of-list marker.  Offsets to actual objects are non-zero.  A
    100  * zero offset indicates an absent entry; this corresponds to aliases
    101  * marked "n/a" in the original Unicode data files.
    102  */
    103 typedef int16_t Offset; /*  must be signed */
    104 
        /* Largest positive value that fits in an Offset (a signed 16-bit integer). */
    105 #define MAX_OFFSET 0x7FFF
    106 
    107 /**
    108  * A generic value for a property or property value.  Typically an
    109  * enum from uchar.h, but sometimes a non-enum value.  It must be
    110  * large enough to accommodate the largest enum value, which as of this
    111  * writing is the largest general category mask.  Need not be signed
    112  * but may be.  Typically it doesn't matter, since the caller will
    113  * cast it to the proper type before use.  Takes the special value
    114  * UCHAR_INVALID_CODE for invalid input.
         * (For example, NameToEnum::getEnum returns UCHAR_INVALID_CODE when no
         * name matches.)
    115  */
    116 typedef int32_t EnumValue;
    117 
    118 /* ---------------------------------------------------------------------- */
    119 /*  ValueMap */
    120 
    121 /**
    122  * For any top-level property that has named values (binary and
    123  * enumerated properties), there is a ValueMap object.  This object
    124  * maps from enum values to two other maps.  One goes from value enums
    125  * to value names.  The other goes from value names to value enums.
    126  *
    127  * The value enum values may be contiguous or disjoint.  If they are
    128  * contiguous then the enumToName_offset is nonzero, and the
    129  * ncEnumToName_offset is zero.  Vice versa if the value enums are
    130  * disjoint.
    131  *
    132  * There are n of these objects, where n is the number of binary
    133  * properties + the number of enumerated properties.
    134  */
    135 struct ValueMap {
    136 
            /*  The members between the begin/end markers are the serialized pnames */
            /*  data; presumably their order and types must not change without a */
            /*  matching change to the data format -- TODO confirm. */
    137     /*  -- begin pnames data -- */
    138     /*  Enum=>name EnumToOffset / NonContiguousEnumToOffset objects. */
    139     /*  Exactly one of these will be nonzero. */
    140     Offset enumToName_offset;   /*  EnumToOffset, used when value enums are contiguous */
    141     Offset ncEnumToName_offset; /*  NonContiguousEnumToOffset, used when they are disjoint */
    142 
    143     Offset nameToEnum_offset; /*  Name=>enum data */
    144     /*  -- end pnames data -- */
    145 };
    146 
    147 /* ---------------------------------------------------------------------- */
    148 /*  PropertyAliases class */
    149 
    150 /**
    151  * A class encapsulating access to the memory-mapped data representing
    152  * property aliases and property value aliases (pnames).  The class
    153  * MUST have no v-table and declares certain methods inline -- small
    154  * methods and methods that are called from only one point.
    155  *
    156  * The data members in this class correspond to the in-memory layout
    157  * of the header of the pnames data.
    158  */
    159 class PropertyAliases {
    160 
    161     /*  -- begin pnames data -- */
    162     /*  Enum=>name EnumToOffset object for binary and enumerated */
    163     /*  properties */
    164     Offset enumToName_offset;
    165 
    166     /*  Name=>enum data for binary & enumerated properties */
    167     Offset nameToEnum_offset;
    168 
    169     /*  Enum=>offset EnumToOffset object mapping enumerated properties */
    170     /*  to ValueMap objects */
    171     Offset enumToValue_offset;
    172 
    173     /*  The following are needed by external readers of this data. */
    174     /*  We don't use them ourselves. */
    175     int16_t total_size; /*  size in bytes excluding the udata header */
    176     Offset valueMap_offset; /*  offset to start of array */
    177     int16_t valueMap_count; /*  number of entries */
    178     Offset nameGroupPool_offset; /*  offset to start of array */
    179     int16_t nameGroupPool_count; /*  number of entries (not groups) */
    180     Offset stringPool_offset; /*  offset to start of pool */
    181     int16_t stringPool_count; /*  number of strings (not size in bytes) */
    182 
    183     /*  -- end pnames data -- */
    184 
    185     friend class ::Builder;
    186 
    187     const ValueMap* getValueMap(EnumValue prop) const;
    188 
    189     const char* chooseNameInGroup(Offset offset,
    190                                   UPropertyNameChoice choice) const;
    191 
    192  public:
    193 
    194     inline const int8_t* getPointer(Offset o) const {
    195         return ((const int8_t*) this) + o;
    196     }
    197 
    198     inline const int8_t* getPointerNull(Offset o) const {
    199         return o ? getPointer(o) : NULL;
    200     }
    201 
    202     inline const char* getPropertyName(EnumValue prop,
    203                                        UPropertyNameChoice choice) const;
    204 
    205     inline EnumValue getPropertyEnum(const char* alias) const;
    206 
    207     inline const char* getPropertyValueName(EnumValue prop, EnumValue value,
    208                                             UPropertyNameChoice choice) const;
    209 
    210     inline EnumValue getPropertyValueEnum(EnumValue prop,
    211                                           const char* alias) const;
    212 
    213     static int32_t
    214     swap(const UDataSwapper *ds,
    215          const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
    216          UErrorCode *pErrorCode);
    217 };
    218 
    219 /* ---------------------------------------------------------------------- */
    220 /*  EnumToOffset */
    221 
    222 /**
    223  * A generic map from enum values to Offsets.  The enum values must be
    224  * contiguous, from enumStart to enumLimit.  The Offset values may
    225  * point to anything.
    226  */
    227 class EnumToOffset {
    228 
    229     /*  -- begin pnames data -- */
    230     EnumValue enumStart;
    231     EnumValue enumLimit;
    232     Offset _offsetArray; /*  [array of enumLimit-enumStart] */
    233     /*  -- end pnames data -- */
    234 
    235     friend class ::Builder;
    236 
    237     Offset* getOffsetArray() {
    238         return &_offsetArray;
    239     }
    240 
    241     const Offset* getOffsetArray() const {
    242         return &_offsetArray;
    243     }
    244 
    245     static int32_t getSize(int32_t n) {
    246         return sizeof(EnumToOffset) + sizeof(Offset) * (n - 1);
    247     }
    248 
    249     int32_t getSize() {
    250         return getSize(enumLimit - enumStart);
    251     }
    252 
    253  public:
    254 
    255     Offset getOffset(EnumValue enumProbe) const {
    256         if (enumProbe < enumStart ||
    257             enumProbe >= enumLimit) {
    258             return 0; /*  not found */
    259         }
    260         const Offset* p = getOffsetArray();
    261         return p[enumProbe - enumStart];
    262     }
    263 
    264     static int32_t
    265     swap(const UDataSwapper *ds,
    266          const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
    267          uint8_t *temp, int32_t pos,
    268          UErrorCode *pErrorCode);
    269 };
    270 
    271 /* ---------------------------------------------------------------------- */
    272 /*  NonContiguousEnumToOffset */
    273 
    274 /**
    275  * A generic map from enum values to Offsets.  The enum values may be
    276  * disjoint.  If they are contiguous, an EnumToOffset should be used
    277  * instead.  The Offset values may point to anything.
    278  */
    279 class NonContiguousEnumToOffset {
    280 
    281     /*  -- begin pnames data -- */
    282     int32_t count;
    283     EnumValue _enumArray; /*  [array of count] */
    284     /*  Offset _offsetArray; // [array of count] after enumValue[count-1] */
    285     /*  -- end pnames data -- */
    286 
    287     friend class ::Builder;
    288 
    289     EnumValue* getEnumArray() {
    290         return &_enumArray;
    291     }
    292 
    293     const EnumValue* getEnumArray() const {
    294         return &_enumArray;
    295     }
    296 
    297     Offset* getOffsetArray() {
    298         return (Offset*) (getEnumArray() + count);
    299     }
    300 
    301     const Offset* getOffsetArray() const {
    302         return (Offset*) (getEnumArray() + count);
    303     }
    304 
    305     static int32_t getSize(int32_t n) {
    306         return sizeof(int32_t) + (sizeof(EnumValue) + sizeof(Offset)) * n;
    307     }
    308 
    309     int32_t getSize() {
    310         return getSize(count);
    311     }
    312 
    313  public:
    314 
    315     Offset getOffset(EnumValue enumProbe) const {
    316         const EnumValue* e = getEnumArray();
    317         const Offset* p = getOffsetArray();
    318         /*  linear search; binary later if warranted */
    319         /*  (binary is not faster for short lists) */
    320         for (int32_t i=0; i<count; ++i) {
    321             if (e[i] < enumProbe) continue;
    322             if (e[i] > enumProbe) break;
    323             return p[i];
    324         }
    325         return 0; /*  not found */
    326     }
    327 
    328     static int32_t
    329     swap(const UDataSwapper *ds,
    330          const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
    331          uint8_t *temp, int32_t pos,
    332          UErrorCode *pErrorCode);
    333 };
    334 
    335 /* ---------------------------------------------------------------------- */
    336 /*  NameToEnum */
    337 
    338 /**
    339  * A map from names to enum values.
    340  */
    341 class NameToEnum {
    342 
    343     /*  -- begin pnames data -- */
    344     int32_t count;       /*  number of entries */
    345     EnumValue _enumArray; /*  [array of count] EnumValues */
    346     /*  Offset _nameArray; // [array of count] offsets to names */
    347     /*  -- end pnames data -- */
    348 
    349     friend class ::Builder;
    350 
    351     EnumValue* getEnumArray() {
    352         return &_enumArray;
    353     }
    354 
    355     const EnumValue* getEnumArray() const {
    356         return &_enumArray;
    357     }
    358 
    359     Offset* getNameArray() {
    360         return (Offset*) (getEnumArray() + count);
    361     }
    362 
    363     const Offset* getNameArray() const {
    364         return (Offset*) (getEnumArray() + count);
    365     }
    366 
    367     static int32_t getSize(int32_t n) {
    368         return sizeof(int32_t) + (sizeof(Offset) + sizeof(EnumValue)) * n;
    369     }
    370 
    371     int32_t getSize() {
    372         return getSize(count);
    373     }
    374 
    375  public:
    376 
    377     EnumValue getEnum(const char* alias, const PropertyAliases& data) const {
    378 
    379         const Offset* n = getNameArray();
    380         const EnumValue* e = getEnumArray();
    381 
    382         /*  linear search; binary later if warranted */
    383         /*  (binary is not faster for short lists) */
    384         for (int32_t i=0; i<count; ++i) {
    385             const char* name = (const char*) data.getPointer(n[i]);
    386             int32_t c = uprv_comparePropertyNames(alias, name);
    387             if (c > 0) continue;
    388             if (c < 0) break;
    389             return e[i];
    390         }
    391 
    392         return UCHAR_INVALID_CODE;
    393     }
    394 
    395     static int32_t
    396     swap(const UDataSwapper *ds,
    397          const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
    398          uint8_t *temp, int32_t pos,
    399          UErrorCode *pErrorCode);
    400 };
    401 
    402 /*----------------------------------------------------------------------
    403  *
    404  * In-memory layout.  THIS IS NOT A STANDALONE DOCUMENT.  It goes
    405  * together with above C++ declarations and gives an overview.
    406  *
    407  * See above for definitions of Offset and EnumValue.  Also, refer to
    408  * above class declarations for the "bottom line" on data layout.
    409  *
    410  * Sizes:
    411  * '*_offset' is an Offset (see above)
    412  * 'count' members are typically int32_t (see above declarations)
    413  * 'enumArray' is an array of EnumValue (see above)
    414  * 'offsetArray' is an array of Offset (see above)
    415  * 'nameArray' is an array of Offset (see above)
    416  * 'enum*' is an EnumValue (see above)
    417  * '*Array [x n]' means that *Array has n elements
    418  *
    419  * References:
    420  * Instead of pointers, this flat data structure contains offsets.
    421  * All offsets are relative to the start of 'header'.  A notation
    422  * is used to indicate what structure each offset points to:
    423  * 'foo (>x)' the offset(s) in foo point to structure x
    424  *
    425  * Structures:
    426  * Each structure is assigned a number, except for the header,
    427  * which is called 'header'.  The numbers are not contiguous
    428  * for historical reasons.  Some structures have sub-parts
    429  * that are denoted with a letter, e.g., "5a".
    430  *
    431  * BEGIN LAYOUT
    432  * ============
    433  * header:
    434  *  enumToName_offset (>0)
    435  *  nameToEnum_offset (>2)
    436  *  enumToValue_offset (>3)
    437  *  (alignment padding built into header)
    438  *
    439  * The header also contains the following, used by "external readers"
    440  * like ICU4J and icuswap.
    441  *
    442  *  // The following are needed by external readers of this data.
    443  *  // We don't use them ourselves.
    444  *  int16_t total_size; // size in bytes excluding the udata header
    445  *  Offset valueMap_offset; // offset to start of array
    446  *  int16_t valueMap_count; // number of entries
    447  *  Offset nameGroupPool_offset; // offset to start of array
    448  *  int16_t nameGroupPool_count; // number of entries (not groups)
    449  *  Offset stringPool_offset; // offset to start of pool
    450  *  int16_t stringPool_count; // number of strings (not size in bytes)
    451  *
    452  * 0: # NonContiguousEnumToOffset obj for props => name groups
    453  *  count
    454  *  enumArray [x count]
    455  *  offsetArray [x count] (>98)
    456  *
    457  * => pad to next 4-byte boundary
    458  *
    459  * (1: omitted -- no longer used)
    460  *
    461  * 2: # NameToEnum obj for binary & enumerated props
    462  *  count
    463  *  enumArray [x count]
    464  *  nameArray [x count] (>99)
    465  *
    466  * => pad to next 4-byte boundary
    467  *
    468  * 3: # NonContiguousEnumToOffset obj for enumerated props => ValueMaps
    469  *  count
    470  *  enumArray [x count]
    471  *  offsetArray [x count] (>4)
    472  *
    473  * => pad to next 4-byte boundary
    474  *
    475  * 4: # ValueMap array [x one for each enumerated prop i]
    476  *  enumToName_offset (>5a +2*i)   one of these two is NULL, one is not
    477  *  ncEnumToName_offset (>5b +2*i)
    478  *  nameToEnum_offset (>6 +2*i)
    479  *
    480  * => pad to next 4-byte boundary
    481  *
    482  * for each enumerated prop (either 5a or 5b):
    483  *
    484  *   5a: # EnumToOffset for enumerated prop's values => name groups
    485  *    enumStart
    486  *    enumLimit
    487  *    offsetArray [x enumLimit - enumStart] (>98)
    488  *
    489  *   => pad to next 4-byte boundary
    490  *
    491  *   5b: # NonContiguousEnumToOffset for enumerated prop's values => name groups
    492  *    count
    493  *    enumArray [x count]
    494  *    offsetArray [x count] (>98)
    495  *
    496  *   => pad to next 4-byte boundary
    497  *
    498  *   6: # NameToEnum for enumerated prop's values
    499  *    count
    500  *    enumArray [x count]
    501  *    nameArray [x count] (>99)
    502  *
    503  *   => pad to next 4-byte boundary
    504  *
    505  * 98: # name group pool {NGP}
    506  *  [array of Offset values] (>99)
    507  *
    508  * 99: # string pool {SP}
    509  *  [pool of nul-terminated char* strings]
    510  */
    511 U_NAMESPACE_END
    512 
    513 #endif /* C++ */
    514 
    515 #endif
    516