Home | History | Annotate | Download | only in toolutil
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2011-2012, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ppucd.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2011dec11
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/uchar.h"
     17 #include "charstr.h"
     18 #include "cstring.h"
     19 #include "ppucd.h"
     20 #include "uassert.h"
     21 #include "uparse.h"
     22 
     23 #include <stdio.h>
     24 #include <string.h>
     25 
     26 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 PropertyNames::~PropertyNames() {}
     31 
     32 int32_t
     33 PropertyNames::getPropertyEnum(const char *name) const {
     34     return u_getPropertyEnum(name);
     35 }
     36 
     37 int32_t
     38 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
     39     return u_getPropertyValueEnum((UProperty)property, name);
     40 }
     41 
     42 UniProps::UniProps()
     43         : start(U_SENTINEL), end(U_SENTINEL),
     44           bmg(U_SENTINEL),
     45           scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
     46           digitValue(-1), numericValue(NULL),
     47           name(NULL), nameAlias(NULL) {
     48     memset(binProps, 0, sizeof(binProps));
     49     memset(intProps, 0, sizeof(intProps));
     50     memset(age, 0, 4);
     51 }
     52 
     53 UniProps::~UniProps() {}
     54 
     55 const int32_t PreparsedUCD::kNumLineBuffers;  // Out-of-class definition of the static constant; readLine() wraps lineIndex around at this count.
     56 
     57 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
     58         : icuPnames(new PropertyNames()), pnames(icuPnames),
     59           file(NULL),
     60           defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
     61           lineNumber(0),
     62           lineType(NO_LINE),
     63           fieldLimit(NULL), lineLimit(NULL) {
     64     if(U_FAILURE(errorCode)) { return; }
     65 
     66     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
     67         filename=NULL;
     68         file=stdin;
     69     } else {
     70         file=fopen(filename, "r");
     71     }
     72     if(file==NULL) {
     73         perror("error opening preparsed UCD");
     74         fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
     75         errorCode=U_FILE_ACCESS_ERROR;
     76         return;
     77     }
     78 
     79     memset(ucdVersion, 0, 4);
     80     lines[0][0]=0;
     81 }
     82 
     83 PreparsedUCD::~PreparsedUCD() {
     84     if(file!=stdin) {
     85         fclose(file);
     86     }
     87     delete icuPnames;
     88 }
     89 
     90 // Same order as the LineType values.
     91 static const char *lineTypeStrings[]={
     92     NULL,
     93     NULL,
     94     "ucd",
     95     "property",
     96     "binary",
     97     "value",
     98     "defaults",
     99     "block",
    100     "cp",
    101     "algnamesrange"
    102 };
    103 
    104 PreparsedUCD::LineType
    105 PreparsedUCD::readLine(UErrorCode &errorCode) {
    106     if(U_FAILURE(errorCode)) { return NO_LINE; }
    107     // Select the next available line buffer.
    108     while(!isLineBufferAvailable(lineIndex)) {
    109         ++lineIndex;
    110         if (lineIndex == kNumLineBuffers) {
    111             lineIndex = 0;
    112         }
    113     }
    114     char *line=lines[lineIndex];
    115     *line=0;
    116     lineLimit=fieldLimit=line;
    117     lineType=NO_LINE;
    118     char *result=fgets(line, sizeof(lines[0]), file);
    119     if(result==NULL) {
    120         if(ferror(file)) {
    121             perror("error reading preparsed UCD");
    122             fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
    123             errorCode=U_FILE_ACCESS_ERROR;
    124         }
    125         return NO_LINE;
    126     }
    127     ++lineNumber;
    128     if(*line=='#') {
    129         fieldLimit=strchr(line, 0);
    130         return lineType=EMPTY_LINE;
    131     }
    132     // Remove trailing /r/n.
    133     char c;
    134     char *limit=strchr(line, 0);
    135     while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
    136     // Remove trailing white space.
    137     while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
    138     *limit=0;
    139     lineLimit=limit;
    140     if(line==limit) {
    141         fieldLimit=limit;
    142         return lineType=EMPTY_LINE;
    143     }
    144     // Split by ';'.
    145     char *semi=line;
    146     while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
    147     fieldLimit=strchr(line, 0);
    148     // Determine the line type.
    149     int32_t type;
    150     for(type=EMPTY_LINE+1;; ++type) {
    151         if(type==LINE_TYPE_COUNT) {
    152             fprintf(stderr,
    153                     "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
    154                     line, (long)lineNumber);
    155             errorCode=U_PARSE_ERROR;
    156             return NO_LINE;
    157         }
    158         if(0==strcmp(line, lineTypeStrings[type])) {
    159             break;
    160         }
    161     }
    162     lineType=(LineType)type;
    163     if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
    164         u_versionFromString(ucdVersion, fieldLimit+1);
    165     }
    166     return lineType;
    167 }
    168 
    169 const char *
    170 PreparsedUCD::firstField() {
    171     char *field=lines[lineIndex];
    172     fieldLimit=strchr(field, 0);
    173     return field;
    174 }
    175 
    176 const char *
    177 PreparsedUCD::nextField() {
    178     if(fieldLimit==lineLimit) { return NULL; }
    179     char *field=fieldLimit+1;
    180     fieldLimit=strchr(field, 0);
    181     return field;
    182 }
    183 
    184 const UniProps *
    185 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
    186     if(U_FAILURE(errorCode)) { return NULL; }
    187     newValues.clear();
    188     if(!lineHasPropertyValues()) {
    189         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    190         return NULL;
    191     }
    192     firstField();
    193     const char *field=nextField();
    194     if(field==NULL) {
    195         // No range field after the type.
    196         fprintf(stderr,
    197                 "error in preparsed UCD: missing default/block/cp range field "
    198                 "(no second field) on line %ld\n",
    199                 (long)lineNumber);
    200         errorCode=U_PARSE_ERROR;
    201         return NULL;
    202     }
    203     UChar32 start, end;
    204     if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
    205     UniProps *props;
    206     switch(lineType) {
    207     case DEFAULTS_LINE:
    208         if(defaultLineIndex>=0) {
    209             fprintf(stderr,
    210                     "error in preparsed UCD: second line with default properties on line %ld\n",
    211                     (long)lineNumber);
    212             errorCode=U_PARSE_ERROR;
    213             return NULL;
    214         }
    215         if(start!=0 || end!=0x10ffff) {
    216             fprintf(stderr,
    217                     "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
    218                     field, (long)lineNumber);
    219             errorCode=U_PARSE_ERROR;
    220             return NULL;
    221         }
    222         props=&defaultProps;
    223         defaultLineIndex=lineIndex;
    224         break;
    225     case BLOCK_LINE:
    226         blockProps=defaultProps;  // Block inherits default properties.
    227         props=&blockProps;
    228         blockLineIndex=lineIndex;
    229         break;
    230     case CP_LINE:
    231         if(blockProps.start<=start && end<=blockProps.end) {
    232             // Code point range fully inside the last block inherits the block properties.
    233             cpProps=blockProps;
    234         } else if(start>blockProps.end || end<blockProps.start) {
    235             // Code point range fully outside the last block inherits the default properties.
    236             cpProps=defaultProps;
    237         } else {
    238             // Code point range partially overlapping with the last block is illegal.
    239             fprintf(stderr,
    240                     "error in preparsed UCD: cp range %s on line %ld only "
    241                     "partially overlaps with block range %04lX..%04lX\n",
    242                     field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
    243             errorCode=U_PARSE_ERROR;
    244             return NULL;
    245         }
    246         props=&cpProps;
    247         break;
    248     default:
    249         // Will not occur because of the range check above.
    250         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    251         return NULL;
    252     }
    253     props->start=start;
    254     props->end=end;
    255     while((field=nextField())!=NULL) {
    256         if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
    257     }
    258     return props;
    259 }
    260 
    261 static const struct {
    262     const char *name;
    263     int32_t prop;
    264 } ppucdProperties[]={
    265     { "Name_Alias", PPUCD_NAME_ALIAS },
    266     { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
    267     { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
    268 };
    269 
    270 // Returns TRUE for "ok to continue parsing fields".
    271 UBool
    272 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
    273                             UErrorCode &errorCode) {
    274     CharString pBuffer;
    275     const char *p=field;
    276     const char *v=strchr(p, '=');
    277     int binaryValue;
    278     if(*p=='-') {
    279         if(v!=NULL) {
    280             fprintf(stderr,
    281                     "error in preparsed UCD: mix of binary-property-no and "
    282                     "enum-property syntax '%s' on line %ld\n",
    283                     field, (long)lineNumber);
    284             errorCode=U_PARSE_ERROR;
    285             return FALSE;
    286         }
    287         binaryValue=0;
    288         ++p;
    289     } else if(v==NULL) {
    290         binaryValue=1;
    291     } else {
    292         binaryValue=-1;
    293         // Copy out the property name rather than modifying the field (writing a NUL).
    294         pBuffer.append(p, (int32_t)(v-p), errorCode);
    295         p=pBuffer.data();
    296         ++v;
    297     }
    298     int32_t prop=pnames->getPropertyEnum(p);
    299     if(prop<0) {
    300         for(int32_t i=0;; ++i) {
    301             if(i==LENGTHOF(ppucdProperties)) {
    302                 // Ignore unknown property names.
    303                 return TRUE;
    304             }
    305             if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
    306                 prop=ppucdProperties[i].prop;
    307                 U_ASSERT(prop>=0);
    308                 break;
    309             }
    310         }
    311     }
    312     if(prop<UCHAR_BINARY_LIMIT) {
    313         if(binaryValue>=0) {
    314             props.binProps[prop]=(UBool)binaryValue;
    315         } else {
    316             // No binary value for a binary property.
    317             fprintf(stderr,
    318                     "error in preparsed UCD: enum-property syntax '%s' "
    319                     "for binary property on line %ld\n",
    320                     field, (long)lineNumber);
    321             errorCode=U_PARSE_ERROR;
    322         }
    323     } else if(binaryValue>=0) {
    324         // Binary value for a non-binary property.
    325         fprintf(stderr,
    326                 "error in preparsed UCD: binary-property syntax '%s' "
    327                 "for non-binary property on line %ld\n",
    328                 field, (long)lineNumber);
    329         errorCode=U_PARSE_ERROR;
    330     } else if (prop < UCHAR_INT_START) {
    331         fprintf(stderr,
    332                 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
    333                 prop, (long)lineNumber);
    334         errorCode=U_PARSE_ERROR;
    335     } else if(prop<UCHAR_INT_LIMIT) {
    336         int32_t value=pnames->getPropertyValueEnum(prop, v);
    337         if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
    338             // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
    339             char *end;
    340             unsigned long ccc=uprv_strtoul(v, &end, 10);
    341             if(v<end && *end==0 && ccc<=254) {
    342                 value=(int32_t)ccc;
    343             }
    344         }
    345         if(value==UCHAR_INVALID_CODE) {
    346             fprintf(stderr,
    347                     "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
    348                     field, (long)lineNumber);
    349             errorCode=U_PARSE_ERROR;
    350         } else {
    351             props.intProps[prop-UCHAR_INT_START]=value;
    352         }
    353     } else if(*v=='<') {
    354         // Do not parse default values like <code point>, just set null values.
    355         switch(prop) {
    356         case UCHAR_BIDI_MIRRORING_GLYPH:
    357             props.bmg=U_SENTINEL;
    358             break;
    359         case UCHAR_SIMPLE_CASE_FOLDING:
    360             props.scf=U_SENTINEL;
    361             break;
    362         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
    363             props.slc=U_SENTINEL;
    364             break;
    365         case UCHAR_SIMPLE_TITLECASE_MAPPING:
    366             props.stc=U_SENTINEL;
    367             break;
    368         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
    369             props.suc=U_SENTINEL;
    370             break;
    371         case UCHAR_CASE_FOLDING:
    372             props.cf.remove();
    373             break;
    374         case UCHAR_LOWERCASE_MAPPING:
    375             props.lc.remove();
    376             break;
    377         case UCHAR_TITLECASE_MAPPING:
    378             props.tc.remove();
    379             break;
    380         case UCHAR_UPPERCASE_MAPPING:
    381             props.uc.remove();
    382             break;
    383         case UCHAR_SCRIPT_EXTENSIONS:
    384             props.scx.clear();
    385             break;
    386         default:
    387             fprintf(stderr,
    388                     "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
    389                     field, (long)lineNumber);
    390             errorCode=U_PARSE_ERROR;
    391         }
    392     } else {
    393         char c;
    394         switch(prop) {
    395         case UCHAR_NUMERIC_VALUE:
    396             props.numericValue=v;
    397             c=*v;
    398             if('0'<=c && c<='9' && v[1]==0) {
    399                 props.digitValue=c-'0';
    400             } else {
    401                 props.digitValue=-1;
    402             }
    403             break;
    404         case UCHAR_NAME:
    405             props.name=v;
    406             break;
    407         case UCHAR_AGE:
    408             u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
    409             break;
    410         case UCHAR_BIDI_MIRRORING_GLYPH:
    411             props.bmg=parseCodePoint(v, errorCode);
    412             break;
    413         case UCHAR_SIMPLE_CASE_FOLDING:
    414             props.scf=parseCodePoint(v, errorCode);
    415             break;
    416         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
    417             props.slc=parseCodePoint(v, errorCode);
    418             break;
    419         case UCHAR_SIMPLE_TITLECASE_MAPPING:
    420             props.stc=parseCodePoint(v, errorCode);
    421             break;
    422         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
    423             props.suc=parseCodePoint(v, errorCode);
    424             break;
    425         case UCHAR_CASE_FOLDING:
    426             parseString(v, props.cf, errorCode);
    427             break;
    428         case UCHAR_LOWERCASE_MAPPING:
    429             parseString(v, props.lc, errorCode);
    430             break;
    431         case UCHAR_TITLECASE_MAPPING:
    432             parseString(v, props.tc, errorCode);
    433             break;
    434         case UCHAR_UPPERCASE_MAPPING:
    435             parseString(v, props.uc, errorCode);
    436             break;
    437         case PPUCD_NAME_ALIAS:
    438             props.nameAlias=v;
    439             break;
    440         case PPUCD_CONDITIONAL_CASE_MAPPINGS:
    441         case PPUCD_TURKIC_CASE_FOLDING:
    442             // No need to parse their values: They are hardcoded in the runtime library.
    443             break;
    444         case UCHAR_SCRIPT_EXTENSIONS:
    445             parseScriptExtensions(v, props.scx, errorCode);
    446             break;
    447         default:
    448             // Ignore unhandled properties.
    449             return TRUE;
    450         }
    451     }
    452     if(U_SUCCESS(errorCode)) {
    453         newValues.add((UChar32)prop);
    454         return TRUE;
    455     } else {
    456         return FALSE;
    457     }
    458 }
    459 
    460 UBool
    461 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
    462     if(U_FAILURE(errorCode)) { return FALSE; }
    463     if(lineType!=ALG_NAMES_RANGE_LINE) {
    464         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    465         return FALSE;
    466     }
    467     firstField();
    468     const char *field=nextField();
    469     if(field==NULL) {
    470         // No range field after the type.
    471         fprintf(stderr,
    472                 "error in preparsed UCD: missing algnamesrange range field "
    473                 "(no second field) on line %ld\n",
    474                 (long)lineNumber);
    475         errorCode=U_PARSE_ERROR;
    476         return FALSE;
    477     }
    478     return parseCodePointRange(field, start, end, errorCode);
    479 }
    480 
    481 UChar32
    482 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
    483     char *end;
    484     uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
    485     if(end<=s || *end!=0 || value>=0x110000) {
    486         fprintf(stderr,
    487                 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
    488                 s, (long)lineNumber);
    489         errorCode=U_PARSE_ERROR;
    490         return U_SENTINEL;
    491     }
    492     return (UChar32)value;
    493 }
    494 
    495 UBool
    496 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
    497     uint32_t st, e;
    498     u_parseCodePointRange(s, &st, &e, &errorCode);
    499     if(U_FAILURE(errorCode)) {
    500         fprintf(stderr,
    501                 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
    502                 s, (long)lineNumber);
    503         return FALSE;
    504     }
    505     start=(UChar32)st;
    506     end=(UChar32)e;
    507     return TRUE;
    508 }
    509 
    510 void
    511 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
    512     UChar *buffer=uni.getBuffer(-1);
    513     int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
    514     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    515         errorCode=U_ZERO_ERROR;
    516         uni.releaseBuffer(0);
    517         buffer=uni.getBuffer(length);
    518         length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
    519     }
    520     uni.releaseBuffer(length);
    521     if(U_FAILURE(errorCode)) {
    522         fprintf(stderr,
    523                 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
    524                 s, (long)lineNumber);
    525     }
    526 }
    527 
    528 void
    529 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
    530     if(U_FAILURE(errorCode)) { return; }
    531     scx.clear();
    532     CharString scString;
    533     for(;;) {
    534         const char *scs;
    535         const char *scLimit=strchr(s, ' ');
    536         if(scLimit!=NULL) {
    537             scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
    538             if(U_FAILURE(errorCode)) { return; }
    539         } else {
    540             scs=s;
    541         }
    542         int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
    543         if(script==UCHAR_INVALID_CODE) {
    544             fprintf(stderr,
    545                     "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
    546                     scs, (long)lineNumber);
    547             errorCode=U_PARSE_ERROR;
    548             return;
    549         } else if(scx.contains(script)) {
    550             fprintf(stderr,
    551                     "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
    552                     scs, (long)lineNumber);
    553             errorCode=U_PARSE_ERROR;
    554             return;
    555         } else {
    556             scx.add(script);
    557         }
    558         if(scLimit!=NULL) {
    559             s=scLimit+1;
    560         } else {
    561             break;
    562         }
    563     }
    564     if(scx.isEmpty()) {
    565         fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
    566         errorCode=U_PARSE_ERROR;
    567     }
    568 }
    569 
    570 U_NAMESPACE_END
    571