Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  unames.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 1999oct04
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/putil.h"
     19 #include "unicode/uchar.h"
     20 #include "unicode/udata.h"
     21 #include "ustr_imp.h"
     22 #include "umutex.h"
     23 #include "cmemory.h"
     24 #include "cstring.h"
     25 #include "ucln_cmn.h"
     26 #include "udataswp.h"
     27 #include "uprops.h"
     28 
     29 /* prototypes ------------------------------------------------------------- */
     30 
/* Number of elements of a real array; the argument must not be a pointer. */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

/* Name and type of the data item that isDataLoaded() passes to udata_openChoice(). */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of LINES_PER_GROUP (1<<5 == 32) lines.
 * code>>GROUP_SHIFT is compared with a group's GROUP_MSB field and
 * code&GROUP_MASK selects the line inside the group.
 * Note that LINES_PER_GROUP and GROUP_MASK have type unsigned long.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
     39 
     40 /*
     41  * This struct was replaced by explicitly accessing equivalent
     42  * fields from triples of uint16_t.
     43  * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
     44  * which broke the assumption that sizeof(Group)==6 and that the ++ operator
     45  * would advance by 6 bytes (3 uint16_t).
     46  *
     47  * We can't just change the data structure because it's loaded from a data file,
     48  * and we don't want to make it less compact, so we changed the access code.
     49  *
     50  * For details see ICU tickets 6331 and 6008.
     51 typedef struct {
     52     uint16_t groupMSB,
     53              offsetHigh, offsetLow; /* avoid padding * /
     54 } Group;
     55  */
     56 enum {
     57     GROUP_MSB,
     58     GROUP_OFFSET_HIGH,
     59     GROUP_OFFSET_LOW,
     60     GROUP_LENGTH
     61 };
     62 
     63 /*
     64  * Get the 32-bit group offset.
     65  * @param group (const uint16_t *) pointer to a Group triple of uint16_t
     66  * @return group offset (int32_t)
     67  */
     68 #define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])
     69 
     70 #define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
     71 #define PREV_GROUP(group) ((group)-GROUP_LENGTH)
     72 
/*
 * One range of algorithmically named code points.
 * NOTE(review): the code that reads these fields is outside this part of the
 * file; presumably start/end are inclusive code points and size is the byte
 * size of the whole range record - confirm against the users of this struct.
 */
typedef struct {
    uint32_t start, end;
    uint8_t type, variant;
    uint16_t size;
} AlgorithmicRange;

/*
 * Header (4*4=16 bytes) at the start of the loaded names data.
 * tokenStringOffset, groupsOffset and groupStringOffset are used as byte
 * offsets from the start of this struct.
 * The token table starts directly behind this header
 * (see "(uint16_t *)names+8" in expandName()/compareName()).
 * NOTE(review): algNamesOffset is presumably a byte offset as well - confirm.
 */
typedef struct {
    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
} UCharNames;
     82 
     83 /*
     84  * Get the groups table from a UCharNames struct.
     85  * The groups table consists of one uint16_t groupCount followed by
     86  * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
     87  * and the comment for the old struct Group above.
     88  *
     89  * @param names (const UCharNames *) pointer to the UCharNames indexes
     90  * @return (const uint16_t *) pointer to the groups table
     91  */
     92 #define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset)
     93 
/*
 * Context for looking up a code point by name.
 * It is passed as the "context" argument together with fn==DO_FIND_NAME:
 * otherName is the name to search for, and code receives the code point
 * whose stored name matches it (see enumGroupNames()).
 */
typedef struct {
    const char *otherName;
    UChar32 code;
} FindName;

/* Pseudo-callback: passing this as the enumerator function requests a name lookup. */
#define DO_FIND_NAME NULL
    100 
/*
 * Loaded names data, set once by isDataLoaded() and reset by unames_cleanup():
 * uCharNamesData is the handle returned by udata_openChoice(),
 * uCharNames is the memory pointer obtained from it via udata_getMemory().
 * gLoadErrorCode remembers the error of a failed load attempt so that
 * later calls fail immediately without trying to load again.
 */
static UDataMemory *uCharNamesData=NULL;
static UCharNames *uCharNames=NULL;
static UErrorCode gLoadErrorCode=U_ZERO_ERROR;

/*
 * Maximum length of character names (regular & 1.0).
 */
static int32_t gMaxNameLength=0;

/*
 * Set of chars used in character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 */
static uint32_t gNameSet[8]={ 0 };
    115 
    116 #define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
    117 #define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1
    118 #define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2
    119 
    120 #define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
    121 
/*
 * Category names used inside extended names ("<category-XXXX>").
 * The table is indexed with the value returned by getCharCat():
 * the last three entries belong to U_NONCHARACTER_CODE_POINT,
 * U_LEAD_SURROGATE and U_TRAIL_SURROGATE.
 * NOTE(review): the entries before those are presumably in the order of the
 * general category values returned by u_charType() - verify when categories
 * are added. getCharCatName() returns "unknown" for values beyond this table.
 */
static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
    "unassigned",
    "uppercase letter",
    "lowercase letter",
    "titlecase letter",
    "modifier letter",
    "other letter",
    "non spacing mark",
    "enclosing mark",
    "combining spacing mark",
    "decimal digit number",
    "letter number",
    "other number",
    "space separator",
    "line separator",
    "paragraph separator",
    "control",
    "format",
    "private use area",
    "surrogate",
    "dash punctuation",
    "start punctuation",
    "end punctuation",
    "connector punctuation",
    "other punctuation",
    "math symbol",
    "currency symbol",
    "modifier symbol",
    "other symbol",
    "initial punctuation",
    "final punctuation",
    "noncharacter",
    "lead surrogate",
    "trail surrogate"
};
    157 
    158 /* implementation ----------------------------------------------------------- */
    159 
    160 static UBool U_CALLCONV unames_cleanup(void)
    161 {
    162     if(uCharNamesData) {
    163         udata_close(uCharNamesData);
    164         uCharNamesData = NULL;
    165     }
    166     if(uCharNames) {
    167         uCharNames = NULL;
    168     }
    169     gMaxNameLength=0;
    170     return TRUE;
    171 }
    172 
    173 static UBool U_CALLCONV
    174 isAcceptable(void *context,
    175              const char *type, const char *name,
    176              const UDataInfo *pInfo) {
    177     return (UBool)(
    178         pInfo->size>=20 &&
    179         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
    180         pInfo->charsetFamily==U_CHARSET_FAMILY &&
    181         pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
    182         pInfo->dataFormat[1]==0x6e &&
    183         pInfo->dataFormat[2]==0x61 &&
    184         pInfo->dataFormat[3]==0x6d &&
    185         pInfo->formatVersion[0]==1);
    186 }
    187 
/*
 * Make sure that the character names data is loaded and published in
 * uCharNames/uCharNamesData.
 *
 * A failed load attempt is remembered in gLoadErrorCode; later calls then
 * report the same error without trying to open the data again.
 *
 * @param pErrorCode receives the error code if loading fails (now or in an
 *                   earlier call)
 * @return TRUE if the data is available, FALSE if it could not be loaded
 */
static UBool
isDataLoaded(UErrorCode *pErrorCode) {
    /* load UCharNames from file if necessary */
    UBool isCached;

    /* do this because double-checked locking is broken */
    /* NOTE(review): UMTX_CHECK comes from umutex.h; presumably it evaluates the
       condition under the global mutex (NULL) - confirm there */
    UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);

    if(!isCached) {
        UCharNames *names;
        UDataMemory *data;

        /* check error code from previous attempt */
        if(U_FAILURE(gLoadErrorCode)) {
            *pErrorCode=gLoadErrorCode;
            return FALSE;
        }

        /* open the data outside the mutex block */
        data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            /* remember the failure; note that this write is not inside the mutex block */
            gLoadErrorCode=*pErrorCode;
            return FALSE;
        }

        names=(UCharNames *)udata_getMemory(data);

        /* in the mutex block, set the data for this process */
        {
            umtx_lock(NULL);
            if(uCharNames==NULL) {
                /* this thread is first: publish its data and give up local ownership */
                uCharNamesData=data;
                uCharNames=names;
                data=NULL;
                names=NULL;
                ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
            }
            umtx_unlock(NULL);
        }

        /* if a different thread set it first, then close the extra data */
        if(data!=NULL) {
            udata_close(data); /* NULL if it was set correctly */
        }
    }
    return TRUE;
}
    235 
    236 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
    237     if((bufferLength)>0) { \
    238         *(buffer)++=c; \
    239         --(bufferLength); \
    240     } \
    241     ++(bufferPos); \
    242 }
    243 
    244 #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
    245 
/*
 * Important: expandName() and compareName() are almost the same -
 * apply fixes to both.
 *
 * UnicodeData.txt uses ';' as a field separator, so no
 * field can contain ';' as part of its contents.
 * In unames.dat, it is marked as token[';']==-1 only if the
 * semicolon is used in the data file - which is iff we
 * have Unicode 1.0 names or ISO comments.
 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments
 * although we know that it will never be part of a name.
 *
 * expandName() expands one compressed name line into buffer.
 * Each byte of the line is either a literal character or a token number;
 * a token is replaced with its NUL-terminated word from the token strings.
 *
 * @param names        the loaded names data; the token table starts behind
 *                     its 16-byte header with a uint16_t token count
 * @param name         the compressed line (modern name, optionally followed
 *                     by ';'-separated further fields)
 * @param nameLength   number of bytes in name
 * @param nameChoice   which field of the line to expand
 * @param buffer       output buffer
 * @param bufferLength capacity of buffer
 * @return the full length of the expanded name, which can be larger than
 *         what fit into buffer; the output is NUL-terminated only if there
 *         is room left after the name
 */
static uint16_t
expandName(UCharNames *names,
           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
           char *buffer, uint16_t bufferLength) {
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++, bufferPos=0;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;

    if(nameChoice==U_UNICODE_10_CHAR_NAME || nameChoice==U_ISO_COMMENT) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            while(nameLength>0) {
                --nameLength;
                if(*name++==';') {
                    break;
                }
            }
            if(nameChoice==U_ISO_COMMENT) {
                /* skip the Unicode 1.0 name as well to get the ISO comment */
                while(nameLength>0) {
                    --nameLength;
                    if(*name++==';') {
                        break;
                    }
                }
            }
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested Unicode 1.0 name here
             */
            nameLength=0;
        }
    }

    /* write each letter directly, and write a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            /* byte values at or above tokenCount have no token table entry */
            if(c!=';') {
                /* implicit letter */
                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                /* NOTE(review): the trail byte is read without checking nameLength>0;
                   this relies on well-formed data (a lead byte is never the last byte) */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                /* (uint16_t)(-1) marks a byte that stands for itself */
                if(c!=';') {
                    /* explicit letter */
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* write token word */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                }
            }
        }
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return bufferPos;
}
    351 
/*
 * compareName() is almost the same as expandName() except that it compares
 * the currently expanded name to an input name.
 * It returns the match/no match result as soon as possible.
 *
 * Unlike expandName(), only U_UNICODE_10_CHAR_NAME causes the modern name
 * to be skipped here; there is no handling for U_ISO_COMMENT.
 *
 * @param names      the loaded names data; the token table starts behind
 *                   its 16-byte header with a uint16_t token count
 * @param name       the compressed line to expand
 * @param nameLength number of bytes in name
 * @param nameChoice which field of the line to compare
 * @param otherName  NUL-terminated name to compare with
 * @return TRUE if the expanded name is exactly equal to otherName
 */
static UBool
compareName(UCharNames *names,
            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
            const char *otherName) {
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;
    /* remember the start of otherName to detect that nothing has been matched yet */
    const char *origOtherName = otherName;

    if(nameChoice==U_UNICODE_10_CHAR_NAME) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            while(nameLength>0) {
                --nameLength;
                if(*name++==';') {
                    break;
                }
            }
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested Unicode 1.0 name here
             */
            nameLength=0;
        }
    }

    /* compare each letter directly, and compare a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            /* byte values at or above tokenCount have no token table entry */
            if(c!=';') {
                /* implicit letter */
                if((char)c!=*otherName++) {
                    return FALSE;
                }
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                /* NOTE(review): the trail byte is read without checking nameLength>0;
                   this relies on well-formed data (a lead byte is never the last byte) */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                /* (uint16_t)(-1) marks a byte that stands for itself */
                if(c!=';') {
                    /* explicit letter */
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* compare token word */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                }
            }
        }
    }

    /* complete match? */
    return (UBool)(*otherName==0);
}
    444 
    445 static uint8_t getCharCat(UChar32 cp) {
    446     uint8_t cat;
    447 
    448     if (UTF_IS_UNICODE_NONCHAR(cp)) {
    449         return U_NONCHARACTER_CODE_POINT;
    450     }
    451 
    452     if ((cat = u_charType(cp)) == U_SURROGATE) {
    453         cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
    454     }
    455 
    456     return cat;
    457 }
    458 
    459 static const char *getCharCatName(UChar32 cp) {
    460     uint8_t cat = getCharCat(cp);
    461 
    462     /* Return unknown if the table of names above is not up to
    463        date. */
    464 
    465     if (cat >= LENGTHOF(charCatNames)) {
    466         return "unknown";
    467     } else {
    468         return charCatNames[cat];
    469     }
    470 }
    471 
    472 static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
    473     const char *catname = getCharCatName(code);
    474     uint16_t length = 0;
    475 
    476     UChar32 cp;
    477     int ndigits, i;
    478 
    479     WRITE_CHAR(buffer, bufferLength, length, '<');
    480     while (catname[length - 1]) {
    481         WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
    482     }
    483     WRITE_CHAR(buffer, bufferLength, length, '-');
    484     for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
    485         ;
    486     if (ndigits < 4)
    487         ndigits = 4;
    488     for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
    489         uint8_t v = (uint8_t)(cp & 0xf);
    490         buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
    491     }
    492     buffer += ndigits;
    493     length += ndigits;
    494     WRITE_CHAR(buffer, bufferLength, length, '>');
    495 
    496     return length;
    497 }
    498 
    499 /*
    500  * getGroup() does a binary search for the group that contains the
    501  * Unicode code point "code".
    502  * The return value is always a valid Group* that may contain "code"
    503  * or else is the highest group before "code".
    504  * If the lowest group is after "code", then that one is returned.
    505  */
    506 static const uint16_t *
    507 getGroup(UCharNames *names, uint32_t code) {
    508     const uint16_t *groups=GET_GROUPS(names);
    509     uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
    510              start=0,
    511              limit=*groups++,
    512              number;
    513 
    514     /* binary search for the group of names that contains the one for code */
    515     while(start<limit-1) {
    516         number=(uint16_t)((start+limit)/2);
    517         if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
    518             limit=number;
    519         } else {
    520             start=number;
    521         }
    522     }
    523 
    524     /* return this regardless of whether it is an exact match */
    525     return groups+start*GROUP_LENGTH;
    526 }
    527 
/*
 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
 * expands them into offsets and lengths for each string.
 * Lengths are stored with a variable-width encoding in consecutive nibbles:
 * If a nibble<0xc, then it is the length itself (0=empty string).
 * If a nibble>=0xc, then it forms a length value with the following nibble.
 * Calculation see below.
 * The offsets and lengths arrays must be at least 33 (one more) long because
 * there is no check here at the end if the last nibble is still used.
 *
 * @param s       pointer to the compressed lengths at the start of a group
 * @param offsets output: offset of each string, relative to the returned pointer
 * @param lengths output: length of each string
 * @return pointer to the first group string, directly behind the lengths
 */
static const uint8_t *
expandGroupLengths(const uint8_t *s,
                   uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
    /* read the lengths of the 32 strings in this group and get each string's offset */
    /* between iterations, length>=12 means that a double-nibble length was started
       in the low nibble of the previous byte */
    uint16_t i=0, offset=0, length=0;
    uint8_t lengthByte;

    /* all 32 lengths must be read to get the offset of the first group string */
    while(i<LINES_PER_GROUP) {
        lengthByte=*s++;

        /* read even nibble - MSBs of lengthByte */
        if(length>=12) {
            /* double-nibble length spread across two bytes */
            length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
            lengthByte&=0xf;
        } else if((lengthByte /* &0xf0 */)>=0xc0) {
            /* double-nibble length spread across this one byte */
            /* lengthByte keeps its high bits, which marks the low nibble as consumed below */
            length=(uint16_t)((lengthByte&0x3f)+12);
        } else {
            /* single-nibble length in MSBs */
            length=(uint16_t)(lengthByte>>4);
            lengthByte&=0xf;
        }

        *offsets++=offset;
        *lengths++=length;

        offset+=length;
        ++i;

        /* read odd nibble - LSBs of lengthByte */
        if((lengthByte&0xf0)==0) {
            /* this nibble was not consumed for a double-nibble length above */
            length=lengthByte;
            if(length<12) {
                /* single-nibble length in LSBs */
                /* this can be a 33rd entry when the loop was entered with i==31 */
                *offsets++=offset;
                *lengths++=length;

                offset+=length;
                ++i;
            }
            /* else: length>=12 is carried into the next iteration as the first nibble */
        } else {
            length=0;   /* prevent double-nibble detection in the next iteration */
        }
    }

    /* now, s is at the first group string */
    return s;
}
    589 
    590 static uint16_t
    591 expandGroupName(UCharNames *names, const uint16_t *group,
    592                 uint16_t lineNumber, UCharNameChoice nameChoice,
    593                 char *buffer, uint16_t bufferLength) {
    594     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
    595     const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
    596     s=expandGroupLengths(s, offsets, lengths);
    597     return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
    598                       buffer, bufferLength);
    599 }
    600 
    601 static uint16_t
    602 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
    603         char *buffer, uint16_t bufferLength) {
    604     const uint16_t *group=getGroup(names, code);
    605     if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
    606         return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
    607                                buffer, bufferLength);
    608     } else {
    609         /* group not found */
    610         /* zero-terminate */
    611         if(bufferLength>0) {
    612             *buffer=0;
    613         }
    614         return 0;
    615     }
    616 }
    617 
    618 /*
    619  * enumGroupNames() enumerates all the names in a 32-group
    620  * and either calls the enumerator function or finds a given input name.
    621  */
    622 static UBool
    623 enumGroupNames(UCharNames *names, const uint16_t *group,
    624                UChar32 start, UChar32 end,
    625                UEnumCharNamesFn *fn, void *context,
    626                UCharNameChoice nameChoice) {
    627     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
    628     const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
    629 
    630     s=expandGroupLengths(s, offsets, lengths);
    631     if(fn!=DO_FIND_NAME) {
    632         char buffer[200];
    633         uint16_t length;
    634 
    635         while(start<=end) {
    636             length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
    637             if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
    638                 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
    639             }
    640             /* here, we assume that the buffer is large enough */
    641             if(length>0) {
    642                 if(!fn(context, start, nameChoice, buffer, length)) {
    643                     return FALSE;
    644                 }
    645             }
    646             ++start;
    647         }
    648     } else {
    649         const char *otherName=((FindName *)context)->otherName;
    650         while(start<=end) {
    651             if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
    652                 ((FindName *)context)->code=start;
    653                 return FALSE;
    654             }
    655             ++start;
    656         }
    657     }
    658     return TRUE;
    659 }
    660 
    661 /*
    662  * enumExtNames enumerate extended names.
    663  * It only needs to do it if it is called with a real function and not
    664  * with the dummy DO_FIND_NAME, because u_charFromName() does a check
    665  * for extended names by itself.
    666  */
    667 static UBool
    668 enumExtNames(UChar32 start, UChar32 end,
    669              UEnumCharNamesFn *fn, void *context)
    670 {
    671     if(fn!=DO_FIND_NAME) {
    672         char buffer[200];
    673         uint16_t length;
    674 
    675         while(start<=end) {
    676             buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
    677             /* here, we assume that the buffer is large enough */
    678             if(length>0) {
    679                 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
    680                     return FALSE;
    681                 }
    682             }
    683             ++start;
    684         }
    685     }
    686 
    687     return TRUE;
    688 }
    689 
/*
 * Enumerate (or, with fn==DO_FIND_NAME, search) the names of the code points
 * start..limit-1.
 *
 * Stored names are handled group by group via enumGroupNames().
 * For U_EXTENDED_CHAR_NAME, ranges that are not covered by any group are
 * passed to enumExtNames().
 *
 * @param names      the loaded names data
 * @param start      first code point (inclusive)
 * @param limit      end of the range (exclusive)
 * @param fn         enumerator function, or DO_FIND_NAME
 * @param context    passed through to fn, or a FindName for DO_FIND_NAME
 * @param nameChoice which kind of name to enumerate
 * @return FALSE if enumGroupNames() or enumExtNames() returned FALSE
 *         (the enumeration was stopped), otherwise TRUE
 */
static UBool
enumNames(UCharNames *names,
          UChar32 start, UChar32 limit,
          UEnumCharNamesFn *fn, void *context,
          UCharNameChoice nameChoice) {
    uint16_t startGroupMSB, endGroupMSB, groupCount;
    const uint16_t *group, *groupLimit;

    /* group numbers of the first and the last code point of the range */
    startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
    endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);

    /* find the group that contains start, or the highest before it */
    group=getGroup(names, start);

    if(startGroupMSB==endGroupMSB) {
        if(startGroupMSB==group[GROUP_MSB]) {
            /* if start and limit-1 are in the same group, then enumerate only in that one */
            return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
        }
        /* otherwise there is no group for this range: fall through to the extended names */
    } else {
        const uint16_t *groups=GET_GROUPS(names);
        groupCount=*groups++;
        /* groupLimit points behind the last group */
        groupLimit=groups+groupCount*GROUP_LENGTH;

        if(startGroupMSB==group[GROUP_MSB]) {
            /* enumerate characters in the partial start group */
            /* (a start group that begins exactly at start is handled by the loop below) */
            if((start&GROUP_MASK)!=0) {
                if(!enumGroupNames(names, group,
                                   start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
                                   fn, context, nameChoice)) {
                    return FALSE;
                }
                group=NEXT_GROUP(group); /* continue with the next group */
            }
        } else if(startGroupMSB>group[GROUP_MSB]) {
            /* make sure that we start enumerating with the first group after start */
            const uint16_t *nextGroup=NEXT_GROUP(group);
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* extended names for the gap from start up to the next group, but not beyond limit */
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames(start, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate entire groups between the start- and end-groups */
        while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
            const uint16_t *nextGroup;
            start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
            if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
                return FALSE;
            }
            nextGroup=NEXT_GROUP(group);
            if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
                /* extended names for the gap between this group and the next one, but not beyond limit */
                UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
                if (end > limit) {
                    end = limit;
                }
                if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
                    return FALSE;
                }
            }
            group=nextGroup;
        }

        /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
        if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
            return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
        } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
            /* all groups are used up: continue below with extended names,
               starting behind the last group if that is beyond start */
            UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
            if (next > start) {
                start = next;
            }
        } else {
            return TRUE;
        }
    }

    /* we have not found a group, which means everything is made of
       extended names. */
    if (nameChoice == U_EXTENDED_CHAR_NAME) {
        /* do not enumerate beyond the highest code point value */
        if (limit > UCHAR_MAX_VALUE + 1) {
            limit = UCHAR_MAX_VALUE + 1;
        }
        return enumExtNames(start, limit - 1, fn, context);
    }

    return TRUE;
}
    783 
    784 static uint16_t
    785 writeFactorSuffix(const uint16_t *factors, uint16_t count,
    786                   const char *s, /* suffix elements */
    787                   uint32_t code,
    788                   uint16_t indexes[8], /* output fields from here */
    789                   const char *elementBases[8], const char *elements[8],
    790                   char *buffer, uint16_t bufferLength) {
    791     uint16_t i, factor, bufferPos=0;
    792     char c;
    793 
    794     /* write elements according to the factors */
    795 
    796     /*
    797      * the factorized elements are determined by modulo arithmetic
    798      * with the factors of this algorithm
    799      *
    800      * note that for fewer operations, count is decremented here
    801      */
    802     --count;
    803     for(i=count; i>0; --i) {
    804         factor=factors[i];
    805         indexes[i]=(uint16_t)(code%factor);
    806         code/=factor;
    807     }
    808     /*
    809      * we don't need to calculate the last modulus because start<=code<=end
    810      * guarantees here that code<=factors[0]
    811      */
    812     indexes[0]=(uint16_t)code;
    813 
    814     /* write each element */
    815     for(;;) {
    816         if(elementBases!=NULL) {
    817             *elementBases++=s;
    818         }
    819 
    820         /* skip indexes[i] strings */
    821         factor=indexes[i];
    822         while(factor>0) {
    823             while(*s++!=0) {}
    824             --factor;
    825         }
    826         if(elements!=NULL) {
    827             *elements++=s;
    828         }
    829 
    830         /* write element */
    831         while((c=*s++)!=0) {
    832             WRITE_CHAR(buffer, bufferLength, bufferPos, c);
    833         }
    834 
    835         /* we do not need to perform the rest of this loop for i==count - break here */
    836         if(i>=count) {
    837             break;
    838         }
    839 
    840         /* skip the rest of the strings for this factors[i] */
    841         factor=(uint16_t)(factors[i]-indexes[i]-1);
    842         while(factor>0) {
    843             while(*s++!=0) {}
    844             --factor;
    845         }
    846 
    847         ++i;
    848     }
    849 
    850     /* zero-terminate */
    851     if(bufferLength>0) {
    852         *buffer=0;
    853     }
    854 
    855     return bufferPos;
    856 }
    857 
    858 /*
    859  * Important:
    860  * Parts of findAlgName() are almost the same as some of getAlgName().
    861  * Fixes must be applied to both.
    862  */
    863 static uint16_t
    864 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
    865         char *buffer, uint16_t bufferLength) {
    866     uint16_t bufferPos=0;
    867 
    868     /*
    869      * Do not write algorithmic Unicode 1.0 names because
    870      * Unihan names are the same as the modern ones,
    871      * extension A was only introduced with Unicode 3.0, and
    872      * the Hangul syllable block was moved and changed around Unicode 1.1.5.
    873      */
    874     if(nameChoice==U_UNICODE_10_CHAR_NAME) {
    875         /* zero-terminate */
    876         if(bufferLength>0) {
    877             *buffer=0;
    878         }
    879         return 0;
    880     }
    881 
    882     switch(range->type) {
    883     case 0: {
    884         /* name = prefix hex-digits */
    885         const char *s=(const char *)(range+1);
    886         char c;
    887 
    888         uint16_t i, count;
    889 
    890         /* copy prefix */
    891         while((c=*s++)!=0) {
    892             WRITE_CHAR(buffer, bufferLength, bufferPos, c);
    893         }
    894 
    895         /* write hexadecimal code point value */
    896         count=range->variant;
    897 
    898         /* zero-terminate */
    899         if(count<bufferLength) {
    900             buffer[count]=0;
    901         }
    902 
    903         for(i=count; i>0;) {
    904             if(--i<bufferLength) {
    905                 c=(char)(code&0xf);
    906                 if(c<10) {
    907                     c+='0';
    908                 } else {
    909                     c+='A'-10;
    910                 }
    911                 buffer[i]=c;
    912             }
    913             code>>=4;
    914         }
    915 
    916         bufferPos+=count;
    917         break;
    918     }
    919     case 1: {
    920         /* name = prefix factorized-elements */
    921         uint16_t indexes[8];
    922         const uint16_t *factors=(const uint16_t *)(range+1);
    923         uint16_t count=range->variant;
    924         const char *s=(const char *)(factors+count);
    925         char c;
    926 
    927         /* copy prefix */
    928         while((c=*s++)!=0) {
    929             WRITE_CHAR(buffer, bufferLength, bufferPos, c);
    930         }
    931 
    932         bufferPos+=writeFactorSuffix(factors, count,
    933                                      s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
    934         break;
    935     }
    936     default:
    937         /* undefined type */
    938         /* zero-terminate */
    939         if(bufferLength>0) {
    940             *buffer=0;
    941         }
    942         break;
    943     }
    944 
    945     return bufferPos;
    946 }
    947 
    948 /*
    949  * Important: enumAlgNames() and findAlgName() are almost the same.
    950  * Any fix must be applied to both.
    951  */
    952 static UBool
    953 enumAlgNames(AlgorithmicRange *range,
    954              UChar32 start, UChar32 limit,
    955              UEnumCharNamesFn *fn, void *context,
    956              UCharNameChoice nameChoice) {
    957     char buffer[200];
    958     uint16_t length;
    959 
    960     if(nameChoice==U_UNICODE_10_CHAR_NAME) {
    961         return TRUE;
    962     }
    963 
    964     switch(range->type) {
    965     case 0: {
    966         char *s, *end;
    967         char c;
    968 
    969         /* get the full name of the start character */
    970         length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
    971         if(length<=0) {
    972             return TRUE;
    973         }
    974 
    975         /* call the enumerator function with this first character */
    976         if(!fn(context, start, nameChoice, buffer, length)) {
    977             return FALSE;
    978         }
    979 
    980         /* go to the end of the name; all these names have the same length */
    981         end=buffer;
    982         while(*end!=0) {
    983             ++end;
    984         }
    985 
    986         /* enumerate the rest of the names */
    987         while(++start<limit) {
    988             /* increment the hexadecimal number on a character-basis */
    989             s=end;
    990             for (;;) {
    991                 c=*--s;
    992                 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
    993                     *s=(char)(c+1);
    994                     break;
    995                 } else if(c=='9') {
    996                     *s='A';
    997                     break;
    998                 } else if(c=='F') {
    999                     *s='0';
   1000                 }
   1001             }
   1002 
   1003             if(!fn(context, start, nameChoice, buffer, length)) {
   1004                 return FALSE;
   1005             }
   1006         }
   1007         break;
   1008     }
   1009     case 1: {
   1010         uint16_t indexes[8];
   1011         const char *elementBases[8], *elements[8];
   1012         const uint16_t *factors=(const uint16_t *)(range+1);
   1013         uint16_t count=range->variant;
   1014         const char *s=(const char *)(factors+count);
   1015         char *suffix, *t;
   1016         uint16_t prefixLength, i, idx;
   1017 
   1018         char c;
   1019 
   1020         /* name = prefix factorized-elements */
   1021 
   1022         /* copy prefix */
   1023         suffix=buffer;
   1024         prefixLength=0;
   1025         while((c=*s++)!=0) {
   1026             *suffix++=c;
   1027             ++prefixLength;
   1028         }
   1029 
   1030         /* append the suffix of the start character */
   1031         length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
   1032                                               s, (uint32_t)start-range->start,
   1033                                               indexes, elementBases, elements,
   1034                                               suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
   1035 
   1036         /* call the enumerator function with this first character */
   1037         if(!fn(context, start, nameChoice, buffer, length)) {
   1038             return FALSE;
   1039         }
   1040 
   1041         /* enumerate the rest of the names */
   1042         while(++start<limit) {
   1043             /* increment the indexes in lexical order bound by the factors */
   1044             i=count;
   1045             for (;;) {
   1046                 idx=(uint16_t)(indexes[--i]+1);
   1047                 if(idx<factors[i]) {
   1048                     /* skip one index and its element string */
   1049                     indexes[i]=idx;
   1050                     s=elements[i];
   1051                     while(*s++!=0) {
   1052                     }
   1053                     elements[i]=s;
   1054                     break;
   1055                 } else {
   1056                     /* reset this index to 0 and its element string to the first one */
   1057                     indexes[i]=0;
   1058                     elements[i]=elementBases[i];
   1059                 }
   1060             }
   1061 
   1062             /* to make matters a little easier, just append all elements to the suffix */
   1063             t=suffix;
   1064             length=prefixLength;
   1065             for(i=0; i<count; ++i) {
   1066                 s=elements[i];
   1067                 while((c=*s++)!=0) {
   1068                     *t++=c;
   1069                     ++length;
   1070                 }
   1071             }
   1072             /* zero-terminate */
   1073             *t=0;
   1074 
   1075             if(!fn(context, start, nameChoice, buffer, length)) {
   1076                 return FALSE;
   1077             }
   1078         }
   1079         break;
   1080     }
   1081     default:
   1082         /* undefined type */
   1083         break;
   1084     }
   1085 
   1086     return TRUE;
   1087 }
   1088 
   1089 /*
   1090  * findAlgName() is almost the same as enumAlgNames() except that it
   1091  * returns the code point for a name if it fits into the range.
   1092  * It returns 0xffff otherwise.
   1093  */
   1094 static UChar32
   1095 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
   1096     UChar32 code;
   1097 
   1098     if(nameChoice==U_UNICODE_10_CHAR_NAME) {
   1099         return 0xffff;
   1100     }
   1101 
   1102     switch(range->type) {
   1103     case 0: {
   1104         /* name = prefix hex-digits */
   1105         const char *s=(const char *)(range+1);
   1106         char c;
   1107 
   1108         uint16_t i, count;
   1109 
   1110         /* compare prefix */
   1111         while((c=*s++)!=0) {
   1112             if((char)c!=*otherName++) {
   1113                 return 0xffff;
   1114             }
   1115         }
   1116 
   1117         /* read hexadecimal code point value */
   1118         count=range->variant;
   1119         code=0;
   1120         for(i=0; i<count; ++i) {
   1121             c=*otherName++;
   1122             if('0'<=c && c<='9') {
   1123                 code=(code<<4)|(c-'0');
   1124             } else if('A'<=c && c<='F') {
   1125                 code=(code<<4)|(c-'A'+10);
   1126             } else {
   1127                 return 0xffff;
   1128             }
   1129         }
   1130 
   1131         /* does it fit into the range? */
   1132         if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
   1133             return code;
   1134         }
   1135         break;
   1136     }
   1137     case 1: {
   1138         char buffer[64];
   1139         uint16_t indexes[8];
   1140         const char *elementBases[8], *elements[8];
   1141         const uint16_t *factors=(const uint16_t *)(range+1);
   1142         uint16_t count=range->variant;
   1143         const char *s=(const char *)(factors+count), *t;
   1144         UChar32 start, limit;
   1145         uint16_t i, idx;
   1146 
   1147         char c;
   1148 
   1149         /* name = prefix factorized-elements */
   1150 
   1151         /* compare prefix */
   1152         while((c=*s++)!=0) {
   1153             if((char)c!=*otherName++) {
   1154                 return 0xffff;
   1155             }
   1156         }
   1157 
   1158         start=(UChar32)range->start;
   1159         limit=(UChar32)(range->end+1);
   1160 
   1161         /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
   1162         writeFactorSuffix(factors, count, s, 0,
   1163                           indexes, elementBases, elements, buffer, sizeof(buffer));
   1164 
   1165         /* compare the first suffix */
   1166         if(0==uprv_strcmp(otherName, buffer)) {
   1167             return start;
   1168         }
   1169 
   1170         /* enumerate and compare the rest of the suffixes */
   1171         while(++start<limit) {
   1172             /* increment the indexes in lexical order bound by the factors */
   1173             i=count;
   1174             for (;;) {
   1175                 idx=(uint16_t)(indexes[--i]+1);
   1176                 if(idx<factors[i]) {
   1177                     /* skip one index and its element string */
   1178                     indexes[i]=idx;
   1179                     s=elements[i];
   1180                     while(*s++!=0) {}
   1181                     elements[i]=s;
   1182                     break;
   1183                 } else {
   1184                     /* reset this index to 0 and its element string to the first one */
   1185                     indexes[i]=0;
   1186                     elements[i]=elementBases[i];
   1187                 }
   1188             }
   1189 
   1190             /* to make matters a little easier, just compare all elements of the suffix */
   1191             t=otherName;
   1192             for(i=0; i<count; ++i) {
   1193                 s=elements[i];
   1194                 while((c=*s++)!=0) {
   1195                     if(c!=*t++) {
   1196                         s=""; /* does not match */
   1197                         i=99;
   1198                     }
   1199                 }
   1200             }
   1201             if(i<99 && *t==0) {
   1202                 return start;
   1203             }
   1204         }
   1205         break;
   1206     }
   1207     default:
   1208         /* undefined type */
   1209         break;
   1210     }
   1211 
   1212     return 0xffff;
   1213 }
   1214 
   1215 /* sets of name characters, maximum name lengths ---------------------------- */
   1216 
   1217 #define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f)))
   1218 #define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0)
   1219 
   1220 static int32_t
   1221 calcStringSetLength(uint32_t set[8], const char *s) {
   1222     int32_t length=0;
   1223     char c;
   1224 
   1225     while((c=*s++)!=0) {
   1226         SET_ADD(set, c);
   1227         ++length;
   1228     }
   1229     return length;
   1230 }
   1231 
   1232 static int32_t
   1233 calcAlgNameSetsLengths(int32_t maxNameLength) {
   1234     AlgorithmicRange *range;
   1235     uint32_t *p;
   1236     uint32_t rangeCount;
   1237     int32_t length;
   1238 
   1239     /* enumerate algorithmic ranges */
   1240     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
   1241     rangeCount=*p;
   1242     range=(AlgorithmicRange *)(p+1);
   1243     while(rangeCount>0) {
   1244         switch(range->type) {
   1245         case 0:
   1246             /* name = prefix + (range->variant times) hex-digits */
   1247             /* prefix */
   1248             length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
   1249             if(length>maxNameLength) {
   1250                 maxNameLength=length;
   1251             }
   1252             break;
   1253         case 1: {
   1254             /* name = prefix factorized-elements */
   1255             const uint16_t *factors=(const uint16_t *)(range+1);
   1256             const char *s;
   1257             int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
   1258 
   1259             /* prefix length */
   1260             s=(const char *)(factors+count);
   1261             length=calcStringSetLength(gNameSet, s);
   1262             s+=length+1; /* start of factor suffixes */
   1263 
   1264             /* get the set and maximum factor suffix length for each factor */
   1265             for(i=0; i<count; ++i) {
   1266                 maxFactorLength=0;
   1267                 for(factor=factors[i]; factor>0; --factor) {
   1268                     factorLength=calcStringSetLength(gNameSet, s);
   1269                     s+=factorLength+1;
   1270                     if(factorLength>maxFactorLength) {
   1271                         maxFactorLength=factorLength;
   1272                     }
   1273                 }
   1274                 length+=maxFactorLength;
   1275             }
   1276 
   1277             if(length>maxNameLength) {
   1278                 maxNameLength=length;
   1279             }
   1280             break;
   1281         }
   1282         default:
   1283             /* unknown type */
   1284             break;
   1285         }
   1286 
   1287         range=(AlgorithmicRange *)((uint8_t *)range+range->size);
   1288         --rangeCount;
   1289     }
   1290     return maxNameLength;
   1291 }
   1292 
   1293 static int32_t
   1294 calcExtNameSetsLengths(int32_t maxNameLength) {
   1295     int32_t i, length;
   1296 
   1297     for(i=0; i<LENGTHOF(charCatNames); ++i) {
   1298         /*
   1299          * for each category, count the length of the category name
   1300          * plus 9=
   1301          * 2 for <>
   1302          * 1 for -
   1303          * 6 for most hex digits per code point
   1304          */
   1305         length=9+calcStringSetLength(gNameSet, charCatNames[i]);
   1306         if(length>maxNameLength) {
   1307             maxNameLength=length;
   1308         }
   1309     }
   1310     return maxNameLength;
   1311 }
   1312 
   1313 static int32_t
   1314 calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
   1315                   uint32_t set[8],
   1316                   const uint8_t **pLine, const uint8_t *lineLimit) {
   1317     const uint8_t *line=*pLine;
   1318     int32_t length=0, tokenLength;
   1319     uint16_t c, token;
   1320 
   1321     while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
   1322         if(c>=tokenCount) {
   1323             /* implicit letter */
   1324             SET_ADD(set, c);
   1325             ++length;
   1326         } else {
   1327             token=tokens[c];
   1328             if(token==(uint16_t)(-2)) {
   1329                 /* this is a lead byte for a double-byte token */
   1330                 c=c<<8|*line++;
   1331                 token=tokens[c];
   1332             }
   1333             if(token==(uint16_t)(-1)) {
   1334                 /* explicit letter */
   1335                 SET_ADD(set, c);
   1336                 ++length;
   1337             } else {
   1338                 /* count token word */
   1339                 if(tokenLengths!=NULL) {
   1340                     /* use cached token length */
   1341                     tokenLength=tokenLengths[c];
   1342                     if(tokenLength==0) {
   1343                         tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
   1344                         tokenLengths[c]=(int8_t)tokenLength;
   1345                     }
   1346                 } else {
   1347                     tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
   1348                 }
   1349                 length+=tokenLength;
   1350             }
   1351         }
   1352     }
   1353 
   1354     *pLine=line;
   1355     return length;
   1356 }
   1357 
   1358 static void
   1359 calcGroupNameSetsLengths(int32_t maxNameLength) {
   1360     uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
   1361 
   1362     uint16_t *tokens=(uint16_t *)uCharNames+8;
   1363     uint16_t tokenCount=*tokens++;
   1364     uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
   1365 
   1366     int8_t *tokenLengths;
   1367 
   1368     const uint16_t *group;
   1369     const uint8_t *s, *line, *lineLimit;
   1370 
   1371     int32_t groupCount, lineNumber, length;
   1372 
   1373     tokenLengths=(int8_t *)uprv_malloc(tokenCount);
   1374     if(tokenLengths!=NULL) {
   1375         uprv_memset(tokenLengths, 0, tokenCount);
   1376     }
   1377 
   1378     group=GET_GROUPS(uCharNames);
   1379     groupCount=*group++;
   1380 
   1381     /* enumerate all groups */
   1382     while(groupCount>0) {
   1383         s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
   1384         s=expandGroupLengths(s, offsets, lengths);
   1385 
   1386         /* enumerate all lines in each group */
   1387         for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
   1388             line=s+offsets[lineNumber];
   1389             length=lengths[lineNumber];
   1390             if(length==0) {
   1391                 continue;
   1392             }
   1393 
   1394             lineLimit=line+length;
   1395 
   1396             /* read regular name */
   1397             length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
   1398             if(length>maxNameLength) {
   1399                 maxNameLength=length;
   1400             }
   1401             if(line==lineLimit) {
   1402                 continue;
   1403             }
   1404 
   1405             /* read Unicode 1.0 name */
   1406             length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
   1407             if(length>maxNameLength) {
   1408                 maxNameLength=length;
   1409             }
   1410             if(line==lineLimit) {
   1411                 continue;
   1412             }
   1413 
   1414             /* read ISO comment */
   1415             /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
   1416         }
   1417 
   1418         group=NEXT_GROUP(group);
   1419         --groupCount;
   1420     }
   1421 
   1422     if(tokenLengths!=NULL) {
   1423         uprv_free(tokenLengths);
   1424     }
   1425 
   1426     /* set gMax... - name length last for threading */
   1427     gMaxNameLength=maxNameLength;
   1428 }
   1429 
   1430 static UBool
   1431 calcNameSetsLengths(UErrorCode *pErrorCode) {
   1432     static const char extChars[]="0123456789ABCDEF<>-";
   1433     int32_t i, maxNameLength;
   1434 
   1435     if(gMaxNameLength!=0) {
   1436         return TRUE;
   1437     }
   1438 
   1439     if(!isDataLoaded(pErrorCode)) {
   1440         return FALSE;
   1441     }
   1442 
   1443     /* set hex digits, used in various names, and <>-, used in extended names */
   1444     for(i=0; i<sizeof(extChars)-1; ++i) {
   1445         SET_ADD(gNameSet, extChars[i]);
   1446     }
   1447 
   1448     /* set sets and lengths from algorithmic names */
   1449     maxNameLength=calcAlgNameSetsLengths(0);
   1450 
   1451     /* set sets and lengths from extended names */
   1452     maxNameLength=calcExtNameSetsLengths(maxNameLength);
   1453 
   1454     /* set sets and lengths from group names, set global maximum values */
   1455     calcGroupNameSetsLengths(maxNameLength);
   1456 
   1457     return TRUE;
   1458 }
   1459 
   1460 /* public API --------------------------------------------------------------- */
   1461 
   1462 U_CAPI int32_t U_EXPORT2
   1463 u_charName(UChar32 code, UCharNameChoice nameChoice,
   1464            char *buffer, int32_t bufferLength,
   1465            UErrorCode *pErrorCode) {
   1466     AlgorithmicRange *algRange;
   1467     uint32_t *p;
   1468     uint32_t i;
   1469     int32_t length;
   1470 
   1471     /* check the argument values */
   1472     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1473         return 0;
   1474     } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
   1475               bufferLength<0 || (bufferLength>0 && buffer==NULL)
   1476     ) {
   1477         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1478         return 0;
   1479     }
   1480 
   1481     if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
   1482         return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
   1483     }
   1484 
   1485     length=0;
   1486 
   1487     /* try algorithmic names first */
   1488     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
   1489     i=*p;
   1490     algRange=(AlgorithmicRange *)(p+1);
   1491     while(i>0) {
   1492         if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
   1493             length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
   1494             break;
   1495         }
   1496         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
   1497         --i;
   1498     }
   1499 
   1500     if(i==0) {
   1501         if (nameChoice == U_EXTENDED_CHAR_NAME) {
   1502             length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
   1503             if (!length) {
   1504                 /* extended character name */
   1505                 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
   1506             }
   1507         } else {
   1508             /* normal character name */
   1509             length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
   1510         }
   1511     }
   1512 
   1513     return u_terminateChars(buffer, bufferLength, length, pErrorCode);
   1514 }
   1515 
   1516 U_CAPI int32_t U_EXPORT2
   1517 u_getISOComment(UChar32 c,
   1518                 char *dest, int32_t destCapacity,
   1519                 UErrorCode *pErrorCode) {
   1520     int32_t length;
   1521 
   1522     /* check the argument values */
   1523     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1524         return 0;
   1525     } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
   1526         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1527         return 0;
   1528     }
   1529 
   1530     if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
   1531         return u_terminateChars(dest, destCapacity, 0, pErrorCode);
   1532     }
   1533 
   1534     /* the ISO comment is stored like a normal character name */
   1535     length=getName(uCharNames, (uint32_t)c, U_ISO_COMMENT, dest, (uint16_t)destCapacity);
   1536     return u_terminateChars(dest, destCapacity, length, pErrorCode);
   1537 }
   1538 
   1539 U_CAPI UChar32 U_EXPORT2
   1540 u_charFromName(UCharNameChoice nameChoice,
   1541                const char *name,
   1542                UErrorCode *pErrorCode) {
   1543     char upper[120], lower[120];
   1544     FindName findName;
   1545     AlgorithmicRange *algRange;
   1546     uint32_t *p;
   1547     uint32_t i;
   1548     UChar32 cp = 0;
   1549     char c0;
   1550     UChar32 error = 0xffff;     /* Undefined, but use this for backwards compatibility. */
   1551 
   1552     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1553         return error;
   1554     }
   1555 
   1556     if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
   1557         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1558         return error;
   1559     }
   1560 
   1561     if(!isDataLoaded(pErrorCode)) {
   1562         return error;
   1563     }
   1564 
   1565     /* construct the uppercase and lowercase of the name first */
   1566     for(i=0; i<sizeof(upper); ++i) {
   1567         if((c0=*name++)!=0) {
   1568             upper[i]=uprv_toupper(c0);
   1569             lower[i]=uprv_tolower(c0);
   1570         } else {
   1571             upper[i]=lower[i]=0;
   1572             break;
   1573         }
   1574     }
   1575     if(i==sizeof(upper)) {
   1576         /* name too long, there is no such character */
   1577         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
   1578         return error;
   1579     }
   1580 
   1581     /* try extended names first */
   1582     if (lower[0] == '<') {
   1583         if (nameChoice == U_EXTENDED_CHAR_NAME) {
   1584             if (lower[--i] == '>') {
   1585                 for (--i; lower[i] && lower[i] != '-'; --i) {
   1586                 }
   1587 
   1588                 if (lower[i] == '-') { /* We've got a category. */
   1589                     uint32_t cIdx;
   1590 
   1591                     lower[i] = 0;
   1592 
   1593                     for (++i; lower[i] != '>'; ++i) {
   1594                         if (lower[i] >= '0' && lower[i] <= '9') {
   1595                             cp = (cp << 4) + lower[i] - '0';
   1596                         } else if (lower[i] >= 'a' && lower[i] <= 'f') {
   1597                             cp = (cp << 4) + lower[i] - 'a' + 10;
   1598                         } else {
   1599                             *pErrorCode = U_ILLEGAL_CHAR_FOUND;
   1600                             return error;
   1601                         }
   1602                     }
   1603 
   1604                     /* Now validate the category name.
   1605                        We could use a binary search, or a trie, if
   1606                        we really wanted to. */
   1607 
   1608                     for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
   1609 
   1610                         if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
   1611                             if (getCharCat(cp) == cIdx) {
   1612                                 return cp;
   1613                             }
   1614                             break;
   1615                         }
   1616                     }
   1617                 }
   1618             }
   1619         }
   1620 
   1621         *pErrorCode = U_ILLEGAL_CHAR_FOUND;
   1622         return error;
   1623     }
   1624 
   1625     /* try algorithmic names now */
   1626     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
   1627     i=*p;
   1628     algRange=(AlgorithmicRange *)(p+1);
   1629     while(i>0) {
   1630         if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
   1631             return cp;
   1632         }
   1633         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
   1634         --i;
   1635     }
   1636 
   1637     /* normal character name */
   1638     findName.otherName=upper;
   1639     findName.code=error;
   1640     enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
   1641     if (findName.code == error) {
   1642          *pErrorCode = U_ILLEGAL_CHAR_FOUND;
   1643     }
   1644     return findName.code;
   1645 }
   1646 
   1647 U_CAPI void U_EXPORT2
   1648 u_enumCharNames(UChar32 start, UChar32 limit,
   1649                 UEnumCharNamesFn *fn,
   1650                 void *context,
   1651                 UCharNameChoice nameChoice,
   1652                 UErrorCode *pErrorCode) {
   1653     AlgorithmicRange *algRange;
   1654     uint32_t *p;
   1655     uint32_t i;
   1656 
   1657     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1658         return;
   1659     }
   1660 
   1661     if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
   1662         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1663         return;
   1664     }
   1665 
   1666     if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
   1667         limit = UCHAR_MAX_VALUE + 1;
   1668     }
   1669     if((uint32_t)start>=(uint32_t)limit) {
   1670         return;
   1671     }
   1672 
   1673     if(!isDataLoaded(pErrorCode)) {
   1674         return;
   1675     }
   1676 
   1677     /* interleave the data-driven ones with the algorithmic ones */
   1678     /* iterate over all algorithmic ranges; assume that they are in ascending order */
   1679     p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
   1680     i=*p;
   1681     algRange=(AlgorithmicRange *)(p+1);
   1682     while(i>0) {
   1683         /* enumerate the character names before the current algorithmic range */
   1684         /* here: start<limit */
   1685         if((uint32_t)start<algRange->start) {
   1686             if((uint32_t)limit<=algRange->start) {
   1687                 enumNames(uCharNames, start, limit, fn, context, nameChoice);
   1688                 return;
   1689             }
   1690             if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
   1691                 return;
   1692             }
   1693             start=(UChar32)algRange->start;
   1694         }
   1695         /* enumerate the character names in the current algorithmic range */
   1696         /* here: algRange->start<=start<limit */
   1697         if((uint32_t)start<=algRange->end) {
   1698             if((uint32_t)limit<=(algRange->end+1)) {
   1699                 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
   1700                 return;
   1701             }
   1702             if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
   1703                 return;
   1704             }
   1705             start=(UChar32)algRange->end+1;
   1706         }
   1707         /* continue to the next algorithmic range (here: start<limit) */
   1708         algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
   1709         --i;
   1710     }
   1711     /* enumerate the character names after the last algorithmic range */
   1712     enumNames(uCharNames, start, limit, fn, context, nameChoice);
   1713 }
   1714 
   1715 U_CAPI int32_t U_EXPORT2
   1716 uprv_getMaxCharNameLength() {
   1717     UErrorCode errorCode=U_ZERO_ERROR;
   1718     if(calcNameSetsLengths(&errorCode)) {
   1719         return gMaxNameLength;
   1720     } else {
   1721         return 0;
   1722     }
   1723 }
   1724 
   1725 /**
   1726  * Converts the char set cset into a Unicode set uset.
   1727  * @param cset Set of 256 bit flags corresponding to a set of chars.
   1728  * @param uset USet to receive characters. Existing contents are deleted.
   1729  */
   1730 static void
   1731 charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
   1732     UChar us[256];
   1733     char cs[256];
   1734 
   1735     int32_t i, length;
   1736     UErrorCode errorCode;
   1737 
   1738     errorCode=U_ZERO_ERROR;
   1739 
   1740     if(!calcNameSetsLengths(&errorCode)) {
   1741         return;
   1742     }
   1743 
   1744     /* build a char string with all chars that are used in character names */
   1745     length=0;
   1746     for(i=0; i<256; ++i) {
   1747         if(SET_CONTAINS(cset, i)) {
   1748             cs[length++]=(char)i;
   1749         }
   1750     }
   1751 
   1752     /* convert the char string to a UChar string */
   1753     u_charsToUChars(cs, us, length);
   1754 
   1755     /* add each UChar to the USet */
   1756     for(i=0; i<length; ++i) {
   1757         if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
   1758             sa->add(sa->set, us[i]);
   1759         }
   1760     }
   1761 }
   1762 
   1763 /**
   1764  * Fills set with characters that are used in Unicode character names.
   1765  * @param set USet to receive characters.
   1766  */
   1767 U_CAPI void U_EXPORT2
   1768 uprv_getCharNameCharacters(const USetAdder *sa) {
   1769     charSetToUSet(gNameSet, sa);
   1770 }
   1771 
   1772 /* data swapping ------------------------------------------------------------ */
   1773 
   1774 /*
   1775  * The token table contains non-negative entries for token bytes,
   1776  * and -1 for bytes that represent themselves in the data file's charset.
   1777  * -2 entries are used for lead bytes.
   1778  *
   1779  * Direct bytes (-1 entries) must be translated from the input charset family
   1780  * to the output charset family.
   1781  * makeTokenMap() writes a permutation mapping for this.
   1782  * Use it once for single-/lead-byte tokens and once more for all trail byte
   1783  * tokens. (';' is an unused trail byte marked with -1.)
   1784  */
   1785 static void
   1786 makeTokenMap(const UDataSwapper *ds,
   1787              int16_t tokens[], uint16_t tokenCount,
   1788              uint8_t map[256],
   1789              UErrorCode *pErrorCode) {
   1790     UBool usedOutChar[256];
   1791     uint16_t i, j;
   1792     uint8_t c1, c2;
   1793 
   1794     if(U_FAILURE(*pErrorCode)) {
   1795         return;
   1796     }
   1797 
   1798     if(ds->inCharset==ds->outCharset) {
   1799         /* Same charset family: identity permutation */
   1800         for(i=0; i<256; ++i) {
   1801             map[i]=(uint8_t)i;
   1802         }
   1803     } else {
   1804         uprv_memset(map, 0, 256);
   1805         uprv_memset(usedOutChar, 0, 256);
   1806 
   1807         if(tokenCount>256) {
   1808             tokenCount=256;
   1809         }
   1810 
   1811         /* set the direct bytes (byte 0 always maps to itself) */
   1812         for(i=1; i<tokenCount; ++i) {
   1813             if(tokens[i]==-1) {
   1814                 /* convert the direct byte character */
   1815                 c1=(uint8_t)i;
   1816                 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
   1817                 if(U_FAILURE(*pErrorCode)) {
   1818                     udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
   1819                                      i, ds->inCharset);
   1820                     return;
   1821                 }
   1822 
   1823                 /* enter the converted character into the map and mark it used */
   1824                 map[c1]=c2;
   1825                 usedOutChar[c2]=TRUE;
   1826             }
   1827         }
   1828 
   1829         /* set the mappings for the rest of the permutation */
   1830         for(i=j=1; i<tokenCount; ++i) {
   1831             /* set mappings that were not set for direct bytes */
   1832             if(map[i]==0) {
   1833                 /* set an output byte value that was not used as an output byte above */
   1834                 while(usedOutChar[j]) {
   1835                     ++j;
   1836                 }
   1837                 map[i]=(uint8_t)j++;
   1838             }
   1839         }
   1840 
   1841         /*
   1842          * leave mappings at tokenCount and above unset if tokenCount<256
   1843          * because they won't be used
   1844          */
   1845     }
   1846 }
   1847 
   1848 U_CAPI int32_t U_EXPORT2
   1849 uchar_swapNames(const UDataSwapper *ds,
   1850                 const void *inData, int32_t length, void *outData,
   1851                 UErrorCode *pErrorCode) {
   1852     const UDataInfo *pInfo;
   1853     int32_t headerSize;
   1854 
   1855     const uint8_t *inBytes;
   1856     uint8_t *outBytes;
   1857 
   1858     uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
   1859              offset, i, count, stringsCount;
   1860 
   1861     const AlgorithmicRange *inRange;
   1862     AlgorithmicRange *outRange;
   1863 
   1864     /* udata_swapDataHeader checks the arguments */
   1865     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
   1866     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   1867         return 0;
   1868     }
   1869 
   1870     /* check data format and format version */
   1871     pInfo=(const UDataInfo *)((const char *)inData+4);
   1872     if(!(
   1873         pInfo->dataFormat[0]==0x75 &&   /* dataFormat="unam" */
   1874         pInfo->dataFormat[1]==0x6e &&
   1875         pInfo->dataFormat[2]==0x61 &&
   1876         pInfo->dataFormat[3]==0x6d &&
   1877         pInfo->formatVersion[0]==1
   1878     )) {
   1879         udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
   1880                          pInfo->dataFormat[0], pInfo->dataFormat[1],
   1881                          pInfo->dataFormat[2], pInfo->dataFormat[3],
   1882                          pInfo->formatVersion[0]);
   1883         *pErrorCode=U_UNSUPPORTED_ERROR;
   1884         return 0;
   1885     }
   1886 
   1887     inBytes=(const uint8_t *)inData+headerSize;
   1888     outBytes=(uint8_t *)outData+headerSize;
   1889     if(length<0) {
   1890         algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
   1891     } else {
   1892         length-=headerSize;
   1893         if( length<20 ||
   1894             (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
   1895         ) {
   1896             udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
   1897                              length);
   1898             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1899             return 0;
   1900         }
   1901     }
   1902 
   1903     if(length<0) {
   1904         /* preflighting: iterate through algorithmic ranges */
   1905         offset=algNamesOffset;
   1906         count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
   1907         offset+=4;
   1908 
   1909         for(i=0; i<count; ++i) {
   1910             inRange=(const AlgorithmicRange *)(inBytes+offset);
   1911             offset+=ds->readUInt16(inRange->size);
   1912         }
   1913     } else {
   1914         /* swap data */
   1915         const uint16_t *p;
   1916         uint16_t *q, *temp;
   1917 
   1918         int16_t tokens[512];
   1919         uint16_t tokenCount;
   1920 
   1921         uint8_t map[256], trailMap[256];
   1922 
   1923         /* copy the data for inaccessible bytes */
   1924         if(inBytes!=outBytes) {
   1925             uprv_memcpy(outBytes, inBytes, length);
   1926         }
   1927 
   1928         /* the initial 4 offsets first */
   1929         tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
   1930         groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
   1931         groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
   1932         ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
   1933 
   1934         /*
   1935          * now the tokens table
   1936          * it needs to be permutated along with the compressed name strings
   1937          */
   1938         p=(const uint16_t *)(inBytes+16);
   1939         q=(uint16_t *)(outBytes+16);
   1940 
   1941         /* read and swap the tokenCount */
   1942         tokenCount=ds->readUInt16(*p);
   1943         ds->swapArray16(ds, p, 2, q, pErrorCode);
   1944         ++p;
   1945         ++q;
   1946 
   1947         /* read the first 512 tokens and make the token maps */
   1948         if(tokenCount<=512) {
   1949             count=tokenCount;
   1950         } else {
   1951             count=512;
   1952         }
   1953         for(i=0; i<count; ++i) {
   1954             tokens[i]=udata_readInt16(ds, p[i]);
   1955         }
   1956         for(; i<512; ++i) {
   1957             tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
   1958         }
   1959         makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
   1960         makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
   1961         if(U_FAILURE(*pErrorCode)) {
   1962             return 0;
   1963         }
   1964 
   1965         /*
   1966          * swap and permutate the tokens
   1967          * go through a temporary array to support in-place swapping
   1968          */
   1969         temp=(uint16_t *)uprv_malloc(tokenCount*2);
   1970         if(temp==NULL) {
   1971             udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
   1972                              tokenCount);
   1973             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
   1974             return 0;
   1975         }
   1976 
   1977         /* swap and permutate single-/lead-byte tokens */
   1978         for(i=0; i<tokenCount && i<256; ++i) {
   1979             ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
   1980         }
   1981 
   1982         /* swap and permutate trail-byte tokens */
   1983         for(; i<tokenCount; ++i) {
   1984             ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
   1985         }
   1986 
   1987         /* copy the result into the output and free the temporary array */
   1988         uprv_memcpy(q, temp, tokenCount*2);
   1989         uprv_free(temp);
   1990 
   1991         /*
   1992          * swap the token strings but not a possible padding byte after
   1993          * the terminating NUL of the last string
   1994          */
   1995         udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
   1996                                     outBytes+tokenStringOffset, pErrorCode);
   1997         if(U_FAILURE(*pErrorCode)) {
   1998             udata_printError(ds, "uchar_swapNames(token strings) failed\n");
   1999             return 0;
   2000         }
   2001 
   2002         /* swap the group table */
   2003         count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
   2004         ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
   2005                            outBytes+groupsOffset, pErrorCode);
   2006 
   2007         /*
   2008          * swap the group strings
   2009          * swap the string bytes but not the nibble-encoded string lengths
   2010          */
   2011         if(ds->inCharset!=ds->outCharset) {
   2012             uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
   2013 
   2014             const uint8_t *inStrings, *nextInStrings;
   2015             uint8_t *outStrings;
   2016 
   2017             uint8_t c;
   2018 
   2019             inStrings=inBytes+groupStringOffset;
   2020             outStrings=outBytes+groupStringOffset;
   2021 
   2022             stringsCount=algNamesOffset-groupStringOffset;
   2023 
   2024             /* iterate through string groups until only a few padding bytes are left */
   2025             while(stringsCount>32) {
   2026                 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
   2027 
   2028                 /* move past the length bytes */
   2029                 stringsCount-=(uint32_t)(nextInStrings-inStrings);
   2030                 outStrings+=nextInStrings-inStrings;
   2031                 inStrings=nextInStrings;
   2032 
   2033                 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
   2034                 stringsCount-=count;
   2035 
   2036                 /* swap the string bytes using map[] and trailMap[] */
   2037                 while(count>0) {
   2038                     c=*inStrings++;
   2039                     *outStrings++=map[c];
   2040                     if(tokens[c]!=-2) {
   2041                         --count;
   2042                     } else {
   2043                         /* token lead byte: swap the trail byte, too */
   2044                         *outStrings++=trailMap[*inStrings++];
   2045                         count-=2;
   2046                     }
   2047                 }
   2048             }
   2049         }
   2050 
   2051         /* swap the algorithmic ranges */
   2052         offset=algNamesOffset;
   2053         count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
   2054         ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
   2055         offset+=4;
   2056 
   2057         for(i=0; i<count; ++i) {
   2058             if(offset>(uint32_t)length) {
   2059                 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
   2060                                  length, i);
   2061                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   2062                 return 0;
   2063             }
   2064 
   2065             inRange=(const AlgorithmicRange *)(inBytes+offset);
   2066             outRange=(AlgorithmicRange *)(outBytes+offset);
   2067             offset+=ds->readUInt16(inRange->size);
   2068 
   2069             ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
   2070             ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
   2071             switch(inRange->type) {
   2072             case 0:
   2073                 /* swap prefix string */
   2074                 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
   2075                                     outRange+1, pErrorCode);
   2076                 if(U_FAILURE(*pErrorCode)) {
   2077                     udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
   2078                                      i);
   2079                     return 0;
   2080                 }
   2081                 break;
   2082             case 1:
   2083                 {
   2084                     /* swap factors and the prefix and factor strings */
   2085                     uint32_t factorsCount;
   2086 
   2087                     factorsCount=inRange->variant;
   2088                     p=(const uint16_t *)(inRange+1);
   2089                     q=(uint16_t *)(outRange+1);
   2090                     ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
   2091 
   2092                     /* swap the strings, up to the last terminating NUL */
   2093                     p+=factorsCount;
   2094                     q+=factorsCount;
   2095                     stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
   2096                     while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
   2097                         --stringsCount;
   2098                     }
   2099                     ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
   2100                 }
   2101                 break;
   2102             default:
   2103                 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
   2104                                  inRange->type, i);
   2105                 *pErrorCode=U_UNSUPPORTED_ERROR;
   2106                 return 0;
   2107             }
   2108         }
   2109     }
   2110 
   2111     return headerSize+(int32_t)offset;
   2112 }
   2113 
   2114 /*
   2115  * Hey, Emacs, please set the following:
   2116  *
   2117  * Local Variables:
   2118  * indent-tabs-mode: nil
   2119  * End:
   2120  *
   2121  */
   2122