Home | History | Annotate | Download | only in toolutil
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2011-2014, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ppucd.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2011dec11
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/uchar.h"
     17 #include "charstr.h"
     18 #include "cstring.h"
     19 #include "ppucd.h"
     20 #include "uassert.h"
     21 #include "uparse.h"
     22 
     23 #include <stdio.h>
     24 #include <string.h>
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 PropertyNames::~PropertyNames() {}
     29 
     30 int32_t
     31 PropertyNames::getPropertyEnum(const char *name) const {
     32     return u_getPropertyEnum(name);
     33 }
     34 
     35 int32_t
     36 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
     37     return u_getPropertyValueEnum((UProperty)property, name);
     38 }
     39 
     40 UniProps::UniProps()
     41         : start(U_SENTINEL), end(U_SENTINEL),
     42           bmg(U_SENTINEL), bpb(U_SENTINEL),
     43           scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
     44           digitValue(-1), numericValue(NULL),
     45           name(NULL), nameAlias(NULL) {
     46     memset(binProps, 0, sizeof(binProps));
     47     memset(intProps, 0, sizeof(intProps));
     48     memset(age, 0, 4);
     49 }
     50 
     51 UniProps::~UniProps() {}
     52 
// Namespace-scope definition of the static constant, without an initializer here
// (presumably the value is given in the class declaration in ppucd.h -- confirm).
// readLine() uses it as the wrap-around limit when cycling through the line buffers.
const int32_t PreparsedUCD::kNumLineBuffers;
     54 
     55 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
     56         : icuPnames(new PropertyNames()), pnames(icuPnames),
     57           file(NULL),
     58           defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
     59           lineNumber(0),
     60           lineType(NO_LINE),
     61           fieldLimit(NULL), lineLimit(NULL) {
     62     if(U_FAILURE(errorCode)) { return; }
     63 
     64     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
     65         filename=NULL;
     66         file=stdin;
     67     } else {
     68         file=fopen(filename, "r");
     69     }
     70     if(file==NULL) {
     71         perror("error opening preparsed UCD");
     72         fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
     73         errorCode=U_FILE_ACCESS_ERROR;
     74         return;
     75     }
     76 
     77     memset(ucdVersion, 0, 4);
     78     lines[0][0]=0;
     79 }
     80 
     81 PreparsedUCD::~PreparsedUCD() {
     82     if(file!=stdin) {
     83         fclose(file);
     84     }
     85     delete icuPnames;
     86 }
     87 
     88 // Same order as the LineType values.
     89 static const char *lineTypeStrings[]={
     90     NULL,
     91     NULL,
     92     "ucd",
     93     "property",
     94     "binary",
     95     "value",
     96     "defaults",
     97     "block",
     98     "cp",
     99     "algnamesrange"
    100 };
    101 
    102 PreparsedUCD::LineType
    103 PreparsedUCD::readLine(UErrorCode &errorCode) {
    104     if(U_FAILURE(errorCode)) { return NO_LINE; }
    105     // Select the next available line buffer.
    106     while(!isLineBufferAvailable(lineIndex)) {
    107         ++lineIndex;
    108         if (lineIndex == kNumLineBuffers) {
    109             lineIndex = 0;
    110         }
    111     }
    112     char *line=lines[lineIndex];
    113     *line=0;
    114     lineLimit=fieldLimit=line;
    115     lineType=NO_LINE;
    116     char *result=fgets(line, sizeof(lines[0]), file);
    117     if(result==NULL) {
    118         if(ferror(file)) {
    119             perror("error reading preparsed UCD");
    120             fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
    121             errorCode=U_FILE_ACCESS_ERROR;
    122         }
    123         return NO_LINE;
    124     }
    125     ++lineNumber;
    126     if(*line=='#') {
    127         fieldLimit=strchr(line, 0);
    128         return lineType=EMPTY_LINE;
    129     }
    130     // Remove trailing /r/n.
    131     char c;
    132     char *limit=strchr(line, 0);
    133     while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
    134     // Remove trailing white space.
    135     while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
    136     *limit=0;
    137     lineLimit=limit;
    138     if(line==limit) {
    139         fieldLimit=limit;
    140         return lineType=EMPTY_LINE;
    141     }
    142     // Split by ';'.
    143     char *semi=line;
    144     while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
    145     fieldLimit=strchr(line, 0);
    146     // Determine the line type.
    147     int32_t type;
    148     for(type=EMPTY_LINE+1;; ++type) {
    149         if(type==LINE_TYPE_COUNT) {
    150             fprintf(stderr,
    151                     "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
    152                     line, (long)lineNumber);
    153             errorCode=U_PARSE_ERROR;
    154             return NO_LINE;
    155         }
    156         if(0==strcmp(line, lineTypeStrings[type])) {
    157             break;
    158         }
    159     }
    160     lineType=(LineType)type;
    161     if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
    162         u_versionFromString(ucdVersion, fieldLimit+1);
    163     }
    164     return lineType;
    165 }
    166 
    167 const char *
    168 PreparsedUCD::firstField() {
    169     char *field=lines[lineIndex];
    170     fieldLimit=strchr(field, 0);
    171     return field;
    172 }
    173 
    174 const char *
    175 PreparsedUCD::nextField() {
    176     if(fieldLimit==lineLimit) { return NULL; }
    177     char *field=fieldLimit+1;
    178     fieldLimit=strchr(field, 0);
    179     return field;
    180 }
    181 
    182 const UniProps *
    183 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
    184     if(U_FAILURE(errorCode)) { return NULL; }
    185     newValues.clear();
    186     if(!lineHasPropertyValues()) {
    187         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    188         return NULL;
    189     }
    190     firstField();
    191     const char *field=nextField();
    192     if(field==NULL) {
    193         // No range field after the type.
    194         fprintf(stderr,
    195                 "error in preparsed UCD: missing default/block/cp range field "
    196                 "(no second field) on line %ld\n",
    197                 (long)lineNumber);
    198         errorCode=U_PARSE_ERROR;
    199         return NULL;
    200     }
    201     UChar32 start, end;
    202     if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
    203     UniProps *props;
    204     switch(lineType) {
    205     case DEFAULTS_LINE:
    206         if(defaultLineIndex>=0) {
    207             fprintf(stderr,
    208                     "error in preparsed UCD: second line with default properties on line %ld\n",
    209                     (long)lineNumber);
    210             errorCode=U_PARSE_ERROR;
    211             return NULL;
    212         }
    213         if(start!=0 || end!=0x10ffff) {
    214             fprintf(stderr,
    215                     "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
    216                     field, (long)lineNumber);
    217             errorCode=U_PARSE_ERROR;
    218             return NULL;
    219         }
    220         props=&defaultProps;
    221         defaultLineIndex=lineIndex;
    222         break;
    223     case BLOCK_LINE:
    224         blockProps=defaultProps;  // Block inherits default properties.
    225         props=&blockProps;
    226         blockLineIndex=lineIndex;
    227         break;
    228     case CP_LINE:
    229         if(blockProps.start<=start && end<=blockProps.end) {
    230             // Code point range fully inside the last block inherits the block properties.
    231             cpProps=blockProps;
    232         } else if(start>blockProps.end || end<blockProps.start) {
    233             // Code point range fully outside the last block inherits the default properties.
    234             cpProps=defaultProps;
    235         } else {
    236             // Code point range partially overlapping with the last block is illegal.
    237             fprintf(stderr,
    238                     "error in preparsed UCD: cp range %s on line %ld only "
    239                     "partially overlaps with block range %04lX..%04lX\n",
    240                     field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
    241             errorCode=U_PARSE_ERROR;
    242             return NULL;
    243         }
    244         props=&cpProps;
    245         break;
    246     default:
    247         // Will not occur because of the range check above.
    248         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    249         return NULL;
    250     }
    251     props->start=start;
    252     props->end=end;
    253     while((field=nextField())!=NULL) {
    254         if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
    255     }
    256     return props;
    257 }
    258 
    259 static const struct {
    260     const char *name;
    261     int32_t prop;
    262 } ppucdProperties[]={
    263     { "Name_Alias", PPUCD_NAME_ALIAS },
    264     { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
    265     { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
    266 };
    267 
    268 // Returns TRUE for "ok to continue parsing fields".
    269 UBool
    270 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
    271                             UErrorCode &errorCode) {
    272     CharString pBuffer;
    273     const char *p=field;
    274     const char *v=strchr(p, '=');
    275     int binaryValue;
    276     if(*p=='-') {
    277         if(v!=NULL) {
    278             fprintf(stderr,
    279                     "error in preparsed UCD: mix of binary-property-no and "
    280                     "enum-property syntax '%s' on line %ld\n",
    281                     field, (long)lineNumber);
    282             errorCode=U_PARSE_ERROR;
    283             return FALSE;
    284         }
    285         binaryValue=0;
    286         ++p;
    287     } else if(v==NULL) {
    288         binaryValue=1;
    289     } else {
    290         binaryValue=-1;
    291         // Copy out the property name rather than modifying the field (writing a NUL).
    292         pBuffer.append(p, (int32_t)(v-p), errorCode);
    293         p=pBuffer.data();
    294         ++v;
    295     }
    296     int32_t prop=pnames->getPropertyEnum(p);
    297     if(prop<0) {
    298         for(int32_t i=0;; ++i) {
    299             if(i==UPRV_LENGTHOF(ppucdProperties)) {
    300                 // Ignore unknown property names.
    301                 return TRUE;
    302             }
    303             if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
    304                 prop=ppucdProperties[i].prop;
    305                 U_ASSERT(prop>=0);
    306                 break;
    307             }
    308         }
    309     }
    310     if(prop<UCHAR_BINARY_LIMIT) {
    311         if(binaryValue>=0) {
    312             props.binProps[prop]=(UBool)binaryValue;
    313         } else {
    314             // No binary value for a binary property.
    315             fprintf(stderr,
    316                     "error in preparsed UCD: enum-property syntax '%s' "
    317                     "for binary property on line %ld\n",
    318                     field, (long)lineNumber);
    319             errorCode=U_PARSE_ERROR;
    320         }
    321     } else if(binaryValue>=0) {
    322         // Binary value for a non-binary property.
    323         fprintf(stderr,
    324                 "error in preparsed UCD: binary-property syntax '%s' "
    325                 "for non-binary property on line %ld\n",
    326                 field, (long)lineNumber);
    327         errorCode=U_PARSE_ERROR;
    328     } else if (prop < UCHAR_INT_START) {
    329         fprintf(stderr,
    330                 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
    331                 prop, (long)lineNumber);
    332         errorCode=U_PARSE_ERROR;
    333     } else if(prop<UCHAR_INT_LIMIT) {
    334         int32_t value=pnames->getPropertyValueEnum(prop, v);
    335         if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
    336             // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
    337             char *end;
    338             unsigned long ccc=uprv_strtoul(v, &end, 10);
    339             if(v<end && *end==0 && ccc<=254) {
    340                 value=(int32_t)ccc;
    341             }
    342         }
    343         if(value==UCHAR_INVALID_CODE) {
    344             fprintf(stderr,
    345                     "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
    346                     field, (long)lineNumber);
    347             errorCode=U_PARSE_ERROR;
    348         } else {
    349             props.intProps[prop-UCHAR_INT_START]=value;
    350         }
    351     } else if(*v=='<') {
    352         // Do not parse default values like <code point>, just set null values.
    353         switch(prop) {
    354         case UCHAR_BIDI_MIRRORING_GLYPH:
    355             props.bmg=U_SENTINEL;
    356             break;
    357         case UCHAR_BIDI_PAIRED_BRACKET:
    358             props.bpb=U_SENTINEL;
    359             break;
    360         case UCHAR_SIMPLE_CASE_FOLDING:
    361             props.scf=U_SENTINEL;
    362             break;
    363         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
    364             props.slc=U_SENTINEL;
    365             break;
    366         case UCHAR_SIMPLE_TITLECASE_MAPPING:
    367             props.stc=U_SENTINEL;
    368             break;
    369         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
    370             props.suc=U_SENTINEL;
    371             break;
    372         case UCHAR_CASE_FOLDING:
    373             props.cf.remove();
    374             break;
    375         case UCHAR_LOWERCASE_MAPPING:
    376             props.lc.remove();
    377             break;
    378         case UCHAR_TITLECASE_MAPPING:
    379             props.tc.remove();
    380             break;
    381         case UCHAR_UPPERCASE_MAPPING:
    382             props.uc.remove();
    383             break;
    384         case UCHAR_SCRIPT_EXTENSIONS:
    385             props.scx.clear();
    386             break;
    387         default:
    388             fprintf(stderr,
    389                     "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
    390                     field, (long)lineNumber);
    391             errorCode=U_PARSE_ERROR;
    392         }
    393     } else {
    394         char c;
    395         switch(prop) {
    396         case UCHAR_NUMERIC_VALUE:
    397             props.numericValue=v;
    398             c=*v;
    399             if('0'<=c && c<='9' && v[1]==0) {
    400                 props.digitValue=c-'0';
    401             } else {
    402                 props.digitValue=-1;
    403             }
    404             break;
    405         case UCHAR_NAME:
    406             props.name=v;
    407             break;
    408         case UCHAR_AGE:
    409             u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
    410             break;
    411         case UCHAR_BIDI_MIRRORING_GLYPH:
    412             props.bmg=parseCodePoint(v, errorCode);
    413             break;
    414         case UCHAR_BIDI_PAIRED_BRACKET:
    415             props.bpb=parseCodePoint(v, errorCode);
    416             break;
    417         case UCHAR_SIMPLE_CASE_FOLDING:
    418             props.scf=parseCodePoint(v, errorCode);
    419             break;
    420         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
    421             props.slc=parseCodePoint(v, errorCode);
    422             break;
    423         case UCHAR_SIMPLE_TITLECASE_MAPPING:
    424             props.stc=parseCodePoint(v, errorCode);
    425             break;
    426         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
    427             props.suc=parseCodePoint(v, errorCode);
    428             break;
    429         case UCHAR_CASE_FOLDING:
    430             parseString(v, props.cf, errorCode);
    431             break;
    432         case UCHAR_LOWERCASE_MAPPING:
    433             parseString(v, props.lc, errorCode);
    434             break;
    435         case UCHAR_TITLECASE_MAPPING:
    436             parseString(v, props.tc, errorCode);
    437             break;
    438         case UCHAR_UPPERCASE_MAPPING:
    439             parseString(v, props.uc, errorCode);
    440             break;
    441         case PPUCD_NAME_ALIAS:
    442             props.nameAlias=v;
    443             break;
    444         case PPUCD_CONDITIONAL_CASE_MAPPINGS:
    445         case PPUCD_TURKIC_CASE_FOLDING:
    446             // No need to parse their values: They are hardcoded in the runtime library.
    447             break;
    448         case UCHAR_SCRIPT_EXTENSIONS:
    449             parseScriptExtensions(v, props.scx, errorCode);
    450             break;
    451         default:
    452             // Ignore unhandled properties.
    453             return TRUE;
    454         }
    455     }
    456     if(U_SUCCESS(errorCode)) {
    457         newValues.add((UChar32)prop);
    458         return TRUE;
    459     } else {
    460         return FALSE;
    461     }
    462 }
    463 
    464 UBool
    465 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
    466     if(U_FAILURE(errorCode)) { return FALSE; }
    467     if(lineType!=ALG_NAMES_RANGE_LINE) {
    468         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    469         return FALSE;
    470     }
    471     firstField();
    472     const char *field=nextField();
    473     if(field==NULL) {
    474         // No range field after the type.
    475         fprintf(stderr,
    476                 "error in preparsed UCD: missing algnamesrange range field "
    477                 "(no second field) on line %ld\n",
    478                 (long)lineNumber);
    479         errorCode=U_PARSE_ERROR;
    480         return FALSE;
    481     }
    482     return parseCodePointRange(field, start, end, errorCode);
    483 }
    484 
    485 UChar32
    486 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
    487     char *end;
    488     uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
    489     if(end<=s || *end!=0 || value>=0x110000) {
    490         fprintf(stderr,
    491                 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
    492                 s, (long)lineNumber);
    493         errorCode=U_PARSE_ERROR;
    494         return U_SENTINEL;
    495     }
    496     return (UChar32)value;
    497 }
    498 
    499 UBool
    500 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
    501     uint32_t st, e;
    502     u_parseCodePointRange(s, &st, &e, &errorCode);
    503     if(U_FAILURE(errorCode)) {
    504         fprintf(stderr,
    505                 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
    506                 s, (long)lineNumber);
    507         return FALSE;
    508     }
    509     start=(UChar32)st;
    510     end=(UChar32)e;
    511     return TRUE;
    512 }
    513 
    514 void
    515 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
    516     UChar *buffer=uni.getBuffer(-1);
    517     int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
    518     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    519         errorCode=U_ZERO_ERROR;
    520         uni.releaseBuffer(0);
    521         buffer=uni.getBuffer(length);
    522         length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
    523     }
    524     uni.releaseBuffer(length);
    525     if(U_FAILURE(errorCode)) {
    526         fprintf(stderr,
    527                 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
    528                 s, (long)lineNumber);
    529     }
    530 }
    531 
    532 void
    533 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
    534     if(U_FAILURE(errorCode)) { return; }
    535     scx.clear();
    536     CharString scString;
    537     for(;;) {
    538         const char *scs;
    539         const char *scLimit=strchr(s, ' ');
    540         if(scLimit!=NULL) {
    541             scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
    542             if(U_FAILURE(errorCode)) { return; }
    543         } else {
    544             scs=s;
    545         }
    546         int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
    547         if(script==UCHAR_INVALID_CODE) {
    548             fprintf(stderr,
    549                     "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
    550                     scs, (long)lineNumber);
    551             errorCode=U_PARSE_ERROR;
    552             return;
    553         } else if(scx.contains(script)) {
    554             fprintf(stderr,
    555                     "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
    556                     scs, (long)lineNumber);
    557             errorCode=U_PARSE_ERROR;
    558             return;
    559         } else {
    560             scx.add(script);
    561         }
    562         if(scLimit!=NULL) {
    563             s=scLimit+1;
    564         } else {
    565             break;
    566         }
    567     }
    568     if(scx.isEmpty()) {
    569         fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
    570         errorCode=U_PARSE_ERROR;
    571     }
    572 }
    573 
    574 U_NAMESPACE_END
    575