Home | History | Annotate | Download | only in toolutil
      1 /*
      2 *******************************************************************************
      3 *   Copyright (C) 2011-2013, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 *   file name:  ppucd.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2011dec11
     12 *   created by: Markus W. Scherer
     13 */
     14 
     15 #include "unicode/utypes.h"
     16 #include "unicode/uchar.h"
     17 #include "charstr.h"
     18 #include "cstring.h"
     19 #include "ppucd.h"
     20 #include "uassert.h"
     21 #include "uparse.h"
     22 
     23 #include <stdio.h>
     24 #include <string.h>
     25 
     26 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 PropertyNames::~PropertyNames() {}
     31 
     32 int32_t
     33 PropertyNames::getPropertyEnum(const char *name) const {
     34     return u_getPropertyEnum(name);
     35 }
     36 
     37 int32_t
     38 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
     39     return u_getPropertyValueEnum((UProperty)property, name);
     40 }
     41 
     42 UniProps::UniProps()
     43         : start(U_SENTINEL), end(U_SENTINEL),
     44           bmg(U_SENTINEL), bpb(U_SENTINEL),
     45           scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
     46           digitValue(-1), numericValue(NULL),
     47           name(NULL), nameAlias(NULL) {
     48     memset(binProps, 0, sizeof(binProps));
     49     memset(intProps, 0, sizeof(intProps));
     50     memset(age, 0, 4);
     51 }
     52 
     53 UniProps::~UniProps() {}
     54 
     55 const int32_t PreparsedUCD::kNumLineBuffers;
     56 
     57 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
     58         : icuPnames(new PropertyNames()), pnames(icuPnames),
     59           file(NULL),
     60           defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
     61           lineNumber(0),
     62           lineType(NO_LINE),
     63           fieldLimit(NULL), lineLimit(NULL) {
     64     if(U_FAILURE(errorCode)) { return; }
     65 
     66     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
     67         filename=NULL;
     68         file=stdin;
     69     } else {
     70         file=fopen(filename, "r");
     71     }
     72     if(file==NULL) {
     73         perror("error opening preparsed UCD");
     74         fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
     75         errorCode=U_FILE_ACCESS_ERROR;
     76         return;
     77     }
     78 
     79     memset(ucdVersion, 0, 4);
     80     lines[0][0]=0;
     81 }
     82 
     83 PreparsedUCD::~PreparsedUCD() {
     84     if(file!=stdin) {
     85         fclose(file);
     86     }
     87     delete icuPnames;
     88 }
     89 
     90 // Same order as the LineType values.
     91 static const char *lineTypeStrings[]={
     92     NULL,
     93     NULL,
     94     "ucd",
     95     "property",
     96     "binary",
     97     "value",
     98     "defaults",
     99     "block",
    100     "cp",
    101     "algnamesrange"
    102 };
    103 
    104 PreparsedUCD::LineType
    105 PreparsedUCD::readLine(UErrorCode &errorCode) {
    106     if(U_FAILURE(errorCode)) { return NO_LINE; }
    107     // Select the next available line buffer.
    108     while(!isLineBufferAvailable(lineIndex)) {
    109         ++lineIndex;
    110         if (lineIndex == kNumLineBuffers) {
    111             lineIndex = 0;
    112         }
    113     }
    114     char *line=lines[lineIndex];
    115     *line=0;
    116     lineLimit=fieldLimit=line;
    117     lineType=NO_LINE;
    118     char *result=fgets(line, sizeof(lines[0]), file);
    119     if(result==NULL) {
    120         if(ferror(file)) {
    121             perror("error reading preparsed UCD");
    122             fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
    123             errorCode=U_FILE_ACCESS_ERROR;
    124         }
    125         return NO_LINE;
    126     }
    127     ++lineNumber;
    128     if(*line=='#') {
    129         fieldLimit=strchr(line, 0);
    130         return lineType=EMPTY_LINE;
    131     }
    132     // Remove trailing /r/n.
    133     char c;
    134     char *limit=strchr(line, 0);
    135     while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
    136     // Remove trailing white space.
    137     while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
    138     *limit=0;
    139     lineLimit=limit;
    140     if(line==limit) {
    141         fieldLimit=limit;
    142         return lineType=EMPTY_LINE;
    143     }
    144     // Split by ';'.
    145     char *semi=line;
    146     while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
    147     fieldLimit=strchr(line, 0);
    148     // Determine the line type.
    149     int32_t type;
    150     for(type=EMPTY_LINE+1;; ++type) {
    151         if(type==LINE_TYPE_COUNT) {
    152             fprintf(stderr,
    153                     "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
    154                     line, (long)lineNumber);
    155             errorCode=U_PARSE_ERROR;
    156             return NO_LINE;
    157         }
    158         if(0==strcmp(line, lineTypeStrings[type])) {
    159             break;
    160         }
    161     }
    162     lineType=(LineType)type;
    163     if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
    164         u_versionFromString(ucdVersion, fieldLimit+1);
    165     }
    166     return lineType;
    167 }
    168 
    169 const char *
    170 PreparsedUCD::firstField() {
    171     char *field=lines[lineIndex];
    172     fieldLimit=strchr(field, 0);
    173     return field;
    174 }
    175 
    176 const char *
    177 PreparsedUCD::nextField() {
    178     if(fieldLimit==lineLimit) { return NULL; }
    179     char *field=fieldLimit+1;
    180     fieldLimit=strchr(field, 0);
    181     return field;
    182 }
    183 
    184 const UniProps *
    185 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
    186     if(U_FAILURE(errorCode)) { return NULL; }
    187     newValues.clear();
    188     if(!lineHasPropertyValues()) {
    189         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    190         return NULL;
    191     }
    192     firstField();
    193     const char *field=nextField();
    194     if(field==NULL) {
    195         // No range field after the type.
    196         fprintf(stderr,
    197                 "error in preparsed UCD: missing default/block/cp range field "
    198                 "(no second field) on line %ld\n",
    199                 (long)lineNumber);
    200         errorCode=U_PARSE_ERROR;
    201         return NULL;
    202     }
    203     UChar32 start, end;
    204     if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
    205     UniProps *props;
    206     switch(lineType) {
    207     case DEFAULTS_LINE:
    208         if(defaultLineIndex>=0) {
    209             fprintf(stderr,
    210                     "error in preparsed UCD: second line with default properties on line %ld\n",
    211                     (long)lineNumber);
    212             errorCode=U_PARSE_ERROR;
    213             return NULL;
    214         }
    215         if(start!=0 || end!=0x10ffff) {
    216             fprintf(stderr,
    217                     "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
    218                     field, (long)lineNumber);
    219             errorCode=U_PARSE_ERROR;
    220             return NULL;
    221         }
    222         props=&defaultProps;
    223         defaultLineIndex=lineIndex;
    224         break;
    225     case BLOCK_LINE:
    226         blockProps=defaultProps;  // Block inherits default properties.
    227         props=&blockProps;
    228         blockLineIndex=lineIndex;
    229         break;
    230     case CP_LINE:
    231         if(blockProps.start<=start && end<=blockProps.end) {
    232             // Code point range fully inside the last block inherits the block properties.
    233             cpProps=blockProps;
    234         } else if(start>blockProps.end || end<blockProps.start) {
    235             // Code point range fully outside the last block inherits the default properties.
    236             cpProps=defaultProps;
    237         } else {
    238             // Code point range partially overlapping with the last block is illegal.
    239             fprintf(stderr,
    240                     "error in preparsed UCD: cp range %s on line %ld only "
    241                     "partially overlaps with block range %04lX..%04lX\n",
    242                     field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
    243             errorCode=U_PARSE_ERROR;
    244             return NULL;
    245         }
    246         props=&cpProps;
    247         break;
    248     default:
    249         // Will not occur because of the range check above.
    250         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    251         return NULL;
    252     }
    253     props->start=start;
    254     props->end=end;
    255     while((field=nextField())!=NULL) {
    256         if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
    257     }
    258     return props;
    259 }
    260 
    261 static const struct {
    262     const char *name;
    263     int32_t prop;
    264 } ppucdProperties[]={
    265     { "Name_Alias", PPUCD_NAME_ALIAS },
    266     { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
    267     { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
    268 };
    269 
    270 // Returns TRUE for "ok to continue parsing fields".
    271 UBool
    272 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
    273                             UErrorCode &errorCode) {
    274     CharString pBuffer;
    275     const char *p=field;
    276     const char *v=strchr(p, '=');
    277     int binaryValue;
    278     if(*p=='-') {
    279         if(v!=NULL) {
    280             fprintf(stderr,
    281                     "error in preparsed UCD: mix of binary-property-no and "
    282                     "enum-property syntax '%s' on line %ld\n",
    283                     field, (long)lineNumber);
    284             errorCode=U_PARSE_ERROR;
    285             return FALSE;
    286         }
    287         binaryValue=0;
    288         ++p;
    289     } else if(v==NULL) {
    290         binaryValue=1;
    291     } else {
    292         binaryValue=-1;
    293         // Copy out the property name rather than modifying the field (writing a NUL).
    294         pBuffer.append(p, (int32_t)(v-p), errorCode);
    295         p=pBuffer.data();
    296         ++v;
    297     }
    298     int32_t prop=pnames->getPropertyEnum(p);
    299     if(prop<0) {
    300         for(int32_t i=0;; ++i) {
    301             if(i==LENGTHOF(ppucdProperties)) {
    302                 // Ignore unknown property names.
    303                 return TRUE;
    304             }
    305             if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
    306                 prop=ppucdProperties[i].prop;
    307                 U_ASSERT(prop>=0);
    308                 break;
    309             }
    310         }
    311     }
    312     if(prop<UCHAR_BINARY_LIMIT) {
    313         if(binaryValue>=0) {
    314             props.binProps[prop]=(UBool)binaryValue;
    315         } else {
    316             // No binary value for a binary property.
    317             fprintf(stderr,
    318                     "error in preparsed UCD: enum-property syntax '%s' "
    319                     "for binary property on line %ld\n",
    320                     field, (long)lineNumber);
    321             errorCode=U_PARSE_ERROR;
    322         }
    323     } else if(binaryValue>=0) {
    324         // Binary value for a non-binary property.
    325         fprintf(stderr,
    326                 "error in preparsed UCD: binary-property syntax '%s' "
    327                 "for non-binary property on line %ld\n",
    328                 field, (long)lineNumber);
    329         errorCode=U_PARSE_ERROR;
    330     } else if (prop < UCHAR_INT_START) {
    331         fprintf(stderr,
    332                 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
    333                 prop, (long)lineNumber);
    334         errorCode=U_PARSE_ERROR;
    335     } else if(prop<UCHAR_INT_LIMIT) {
    336         int32_t value=pnames->getPropertyValueEnum(prop, v);
    337         if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
    338             // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
    339             char *end;
    340             unsigned long ccc=uprv_strtoul(v, &end, 10);
    341             if(v<end && *end==0 && ccc<=254) {
    342                 value=(int32_t)ccc;
    343             }
    344         }
    345         if(value==UCHAR_INVALID_CODE) {
    346             fprintf(stderr,
    347                     "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
    348                     field, (long)lineNumber);
    349             errorCode=U_PARSE_ERROR;
    350         } else {
    351             props.intProps[prop-UCHAR_INT_START]=value;
    352         }
    353     } else if(*v=='<') {
    354         // Do not parse default values like <code point>, just set null values.
    355         switch(prop) {
    356         case UCHAR_BIDI_MIRRORING_GLYPH:
    357             props.bmg=U_SENTINEL;
    358             break;
    359         case UCHAR_BIDI_PAIRED_BRACKET:
    360             props.bpb=U_SENTINEL;
    361             break;
    362         case UCHAR_SIMPLE_CASE_FOLDING:
    363             props.scf=U_SENTINEL;
    364             break;
    365         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
    366             props.slc=U_SENTINEL;
    367             break;
    368         case UCHAR_SIMPLE_TITLECASE_MAPPING:
    369             props.stc=U_SENTINEL;
    370             break;
    371         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
    372             props.suc=U_SENTINEL;
    373             break;
    374         case UCHAR_CASE_FOLDING:
    375             props.cf.remove();
    376             break;
    377         case UCHAR_LOWERCASE_MAPPING:
    378             props.lc.remove();
    379             break;
    380         case UCHAR_TITLECASE_MAPPING:
    381             props.tc.remove();
    382             break;
    383         case UCHAR_UPPERCASE_MAPPING:
    384             props.uc.remove();
    385             break;
    386         case UCHAR_SCRIPT_EXTENSIONS:
    387             props.scx.clear();
    388             break;
    389         default:
    390             fprintf(stderr,
    391                     "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
    392                     field, (long)lineNumber);
    393             errorCode=U_PARSE_ERROR;
    394         }
    395     } else {
    396         char c;
    397         switch(prop) {
    398         case UCHAR_NUMERIC_VALUE:
    399             props.numericValue=v;
    400             c=*v;
    401             if('0'<=c && c<='9' && v[1]==0) {
    402                 props.digitValue=c-'0';
    403             } else {
    404                 props.digitValue=-1;
    405             }
    406             break;
    407         case UCHAR_NAME:
    408             props.name=v;
    409             break;
    410         case UCHAR_AGE:
    411             u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
    412             break;
    413         case UCHAR_BIDI_MIRRORING_GLYPH:
    414             props.bmg=parseCodePoint(v, errorCode);
    415             break;
    416         case UCHAR_BIDI_PAIRED_BRACKET:
    417             props.bpb=parseCodePoint(v, errorCode);
    418             break;
    419         case UCHAR_SIMPLE_CASE_FOLDING:
    420             props.scf=parseCodePoint(v, errorCode);
    421             break;
    422         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
    423             props.slc=parseCodePoint(v, errorCode);
    424             break;
    425         case UCHAR_SIMPLE_TITLECASE_MAPPING:
    426             props.stc=parseCodePoint(v, errorCode);
    427             break;
    428         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
    429             props.suc=parseCodePoint(v, errorCode);
    430             break;
    431         case UCHAR_CASE_FOLDING:
    432             parseString(v, props.cf, errorCode);
    433             break;
    434         case UCHAR_LOWERCASE_MAPPING:
    435             parseString(v, props.lc, errorCode);
    436             break;
    437         case UCHAR_TITLECASE_MAPPING:
    438             parseString(v, props.tc, errorCode);
    439             break;
    440         case UCHAR_UPPERCASE_MAPPING:
    441             parseString(v, props.uc, errorCode);
    442             break;
    443         case PPUCD_NAME_ALIAS:
    444             props.nameAlias=v;
    445             break;
    446         case PPUCD_CONDITIONAL_CASE_MAPPINGS:
    447         case PPUCD_TURKIC_CASE_FOLDING:
    448             // No need to parse their values: They are hardcoded in the runtime library.
    449             break;
    450         case UCHAR_SCRIPT_EXTENSIONS:
    451             parseScriptExtensions(v, props.scx, errorCode);
    452             break;
    453         default:
    454             // Ignore unhandled properties.
    455             return TRUE;
    456         }
    457     }
    458     if(U_SUCCESS(errorCode)) {
    459         newValues.add((UChar32)prop);
    460         return TRUE;
    461     } else {
    462         return FALSE;
    463     }
    464 }
    465 
    466 UBool
    467 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
    468     if(U_FAILURE(errorCode)) { return FALSE; }
    469     if(lineType!=ALG_NAMES_RANGE_LINE) {
    470         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    471         return FALSE;
    472     }
    473     firstField();
    474     const char *field=nextField();
    475     if(field==NULL) {
    476         // No range field after the type.
    477         fprintf(stderr,
    478                 "error in preparsed UCD: missing algnamesrange range field "
    479                 "(no second field) on line %ld\n",
    480                 (long)lineNumber);
    481         errorCode=U_PARSE_ERROR;
    482         return FALSE;
    483     }
    484     return parseCodePointRange(field, start, end, errorCode);
    485 }
    486 
    487 UChar32
    488 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
    489     char *end;
    490     uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
    491     if(end<=s || *end!=0 || value>=0x110000) {
    492         fprintf(stderr,
    493                 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
    494                 s, (long)lineNumber);
    495         errorCode=U_PARSE_ERROR;
    496         return U_SENTINEL;
    497     }
    498     return (UChar32)value;
    499 }
    500 
    501 UBool
    502 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
    503     uint32_t st, e;
    504     u_parseCodePointRange(s, &st, &e, &errorCode);
    505     if(U_FAILURE(errorCode)) {
    506         fprintf(stderr,
    507                 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
    508                 s, (long)lineNumber);
    509         return FALSE;
    510     }
    511     start=(UChar32)st;
    512     end=(UChar32)e;
    513     return TRUE;
    514 }
    515 
    516 void
    517 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
    518     UChar *buffer=uni.getBuffer(-1);
    519     int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
    520     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    521         errorCode=U_ZERO_ERROR;
    522         uni.releaseBuffer(0);
    523         buffer=uni.getBuffer(length);
    524         length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
    525     }
    526     uni.releaseBuffer(length);
    527     if(U_FAILURE(errorCode)) {
    528         fprintf(stderr,
    529                 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
    530                 s, (long)lineNumber);
    531     }
    532 }
    533 
    534 void
    535 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
    536     if(U_FAILURE(errorCode)) { return; }
    537     scx.clear();
    538     CharString scString;
    539     for(;;) {
    540         const char *scs;
    541         const char *scLimit=strchr(s, ' ');
    542         if(scLimit!=NULL) {
    543             scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
    544             if(U_FAILURE(errorCode)) { return; }
    545         } else {
    546             scs=s;
    547         }
    548         int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
    549         if(script==UCHAR_INVALID_CODE) {
    550             fprintf(stderr,
    551                     "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
    552                     scs, (long)lineNumber);
    553             errorCode=U_PARSE_ERROR;
    554             return;
    555         } else if(scx.contains(script)) {
    556             fprintf(stderr,
    557                     "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
    558                     scs, (long)lineNumber);
    559             errorCode=U_PARSE_ERROR;
    560             return;
    561         } else {
    562             scx.add(script);
    563         }
    564         if(scLimit!=NULL) {
    565             s=scLimit+1;
    566         } else {
    567             break;
    568         }
    569     }
    570     if(scx.isEmpty()) {
    571         fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
    572         errorCode=U_PARSE_ERROR;
    573     }
    574 }
    575 
    576 U_NAMESPACE_END
    577