Home | History | Annotate | Download | only in toolutil
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *   Copyright (C) 2011-2014, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 *   file name:  ppucd.cpp
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2011dec11
     14 *   created by: Markus W. Scherer
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 #include "unicode/uchar.h"
     19 #include "charstr.h"
     20 #include "cstring.h"
     21 #include "ppucd.h"
     22 #include "uassert.h"
     23 #include "uparse.h"
     24 
     25 #include <stdio.h>
     26 #include <string.h>
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 PropertyNames::~PropertyNames() {}
     31 
     32 int32_t
     33 PropertyNames::getPropertyEnum(const char *name) const {
     34     return u_getPropertyEnum(name);
     35 }
     36 
     37 int32_t
     38 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const {
     39     return u_getPropertyValueEnum((UProperty)property, name);
     40 }
     41 
     42 UniProps::UniProps()
     43         : start(U_SENTINEL), end(U_SENTINEL),
     44           bmg(U_SENTINEL), bpb(U_SENTINEL),
     45           scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL),
     46           digitValue(-1), numericValue(NULL),
     47           name(NULL), nameAlias(NULL) {
     48     memset(binProps, 0, sizeof(binProps));
     49     memset(intProps, 0, sizeof(intProps));
     50     memset(age, 0, 4);
     51 }
     52 
     53 UniProps::~UniProps() {}
     54 
     55 const int32_t PreparsedUCD::kNumLineBuffers;
     56 
     57 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode)
     58         : icuPnames(new PropertyNames()), pnames(icuPnames),
     59           file(NULL),
     60           defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0),
     61           lineNumber(0),
     62           lineType(NO_LINE),
     63           fieldLimit(NULL), lineLimit(NULL) {
     64     if(U_FAILURE(errorCode)) { return; }
     65 
     66     if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) {
     67         filename=NULL;
     68         file=stdin;
     69     } else {
     70         file=fopen(filename, "r");
     71     }
     72     if(file==NULL) {
     73         perror("error opening preparsed UCD");
     74         fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\"");
     75         errorCode=U_FILE_ACCESS_ERROR;
     76         return;
     77     }
     78 
     79     memset(ucdVersion, 0, 4);
     80     lines[0][0]=0;
     81 }
     82 
     83 PreparsedUCD::~PreparsedUCD() {
     84     if(file!=stdin) {
     85         fclose(file);
     86     }
     87     delete icuPnames;
     88 }
     89 
     90 // Same order as the LineType values.
     91 static const char *lineTypeStrings[]={
     92     NULL,
     93     NULL,
     94     "ucd",
     95     "property",
     96     "binary",
     97     "value",
     98     "defaults",
     99     "block",
    100     "cp",
    101     "unassigned",
    102     "algnamesrange"
    103 };
    104 
    105 PreparsedUCD::LineType
    106 PreparsedUCD::readLine(UErrorCode &errorCode) {
    107     if(U_FAILURE(errorCode)) { return NO_LINE; }
    108     // Select the next available line buffer.
    109     while(!isLineBufferAvailable(lineIndex)) {
    110         ++lineIndex;
    111         if (lineIndex == kNumLineBuffers) {
    112             lineIndex = 0;
    113         }
    114     }
    115     char *line=lines[lineIndex];
    116     *line=0;
    117     lineLimit=fieldLimit=line;
    118     lineType=NO_LINE;
    119     char *result=fgets(line, sizeof(lines[0]), file);
    120     if(result==NULL) {
    121         if(ferror(file)) {
    122             perror("error reading preparsed UCD");
    123             fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber);
    124             errorCode=U_FILE_ACCESS_ERROR;
    125         }
    126         return NO_LINE;
    127     }
    128     ++lineNumber;
    129     if(*line=='#') {
    130         fieldLimit=strchr(line, 0);
    131         return lineType=EMPTY_LINE;
    132     }
    133     // Remove trailing /r/n.
    134     char c;
    135     char *limit=strchr(line, 0);
    136     while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; }
    137     // Remove trailing white space.
    138     while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; }
    139     *limit=0;
    140     lineLimit=limit;
    141     if(line==limit) {
    142         fieldLimit=limit;
    143         return lineType=EMPTY_LINE;
    144     }
    145     // Split by ';'.
    146     char *semi=line;
    147     while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; }
    148     fieldLimit=strchr(line, 0);
    149     // Determine the line type.
    150     int32_t type;
    151     for(type=EMPTY_LINE+1;; ++type) {
    152         if(type==LINE_TYPE_COUNT) {
    153             fprintf(stderr,
    154                     "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n",
    155                     line, (long)lineNumber);
    156             errorCode=U_PARSE_ERROR;
    157             return NO_LINE;
    158         }
    159         if(0==strcmp(line, lineTypeStrings[type])) {
    160             break;
    161         }
    162     }
    163     lineType=(LineType)type;
    164     if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) {
    165         u_versionFromString(ucdVersion, fieldLimit+1);
    166     }
    167     return lineType;
    168 }
    169 
    170 const char *
    171 PreparsedUCD::firstField() {
    172     char *field=lines[lineIndex];
    173     fieldLimit=strchr(field, 0);
    174     return field;
    175 }
    176 
    177 const char *
    178 PreparsedUCD::nextField() {
    179     if(fieldLimit==lineLimit) { return NULL; }
    180     char *field=fieldLimit+1;
    181     fieldLimit=strchr(field, 0);
    182     return field;
    183 }
    184 
    185 const UniProps *
    186 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) {
    187     if(U_FAILURE(errorCode)) { return NULL; }
    188     newValues.clear();
    189     if(!lineHasPropertyValues()) {
    190         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    191         return NULL;
    192     }
    193     firstField();
    194     const char *field=nextField();
    195     if(field==NULL) {
    196         // No range field after the type.
    197         fprintf(stderr,
    198                 "error in preparsed UCD: missing default/block/cp range field "
    199                 "(no second field) on line %ld\n",
    200                 (long)lineNumber);
    201         errorCode=U_PARSE_ERROR;
    202         return NULL;
    203     }
    204     UChar32 start, end;
    205     if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; }
    206     UniProps *props;
    207     UBool insideBlock=FALSE;  // TRUE if cp or unassigned range inside the block range.
    208     switch(lineType) {
    209     case DEFAULTS_LINE:
    210         // Should occur before any block/cp/unassigned line.
    211         if(blockLineIndex>=0) {
    212             fprintf(stderr,
    213                     "error in preparsed UCD: default line %ld after one or more block lines\n",
    214                     (long)lineNumber);
    215             errorCode=U_PARSE_ERROR;
    216             return NULL;
    217         }
    218         if(defaultLineIndex>=0) {
    219             fprintf(stderr,
    220                     "error in preparsed UCD: second line with default properties on line %ld\n",
    221                     (long)lineNumber);
    222             errorCode=U_PARSE_ERROR;
    223             return NULL;
    224         }
    225         if(start!=0 || end!=0x10ffff) {
    226             fprintf(stderr,
    227                     "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n",
    228                     field, (long)lineNumber);
    229             errorCode=U_PARSE_ERROR;
    230             return NULL;
    231         }
    232         props=&defaultProps;
    233         defaultLineIndex=lineIndex;
    234         break;
    235     case BLOCK_LINE:
    236         blockProps=defaultProps;  // Block inherits default properties.
    237         props=&blockProps;
    238         blockLineIndex=lineIndex;
    239         break;
    240     case CP_LINE:
    241     case UNASSIGNED_LINE:
    242         if(blockProps.start<=start && end<=blockProps.end) {
    243             insideBlock=TRUE;
    244             if(lineType==CP_LINE) {
    245                 // Code point range fully inside the last block inherits the block properties.
    246                 cpProps=blockProps;
    247             } else {
    248                 // Unassigned line inside the block is based on default properties
    249                 // which override block properties.
    250                 cpProps=defaultProps;
    251                 newValues=blockValues;
    252                 // Except, it inherits the one blk=Block property.
    253                 int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START;
    254                 cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex];
    255                 newValues.remove((UChar32)UCHAR_BLOCK);
    256             }
    257         } else if(start>blockProps.end || end<blockProps.start) {
    258             // Code point range fully outside the last block inherits the default properties.
    259             cpProps=defaultProps;
    260         } else {
    261             // Code point range partially overlapping with the last block is illegal.
    262             fprintf(stderr,
    263                     "error in preparsed UCD: cp range %s on line %ld only "
    264                     "partially overlaps with block range %04lX..%04lX\n",
    265                     field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end);
    266             errorCode=U_PARSE_ERROR;
    267             return NULL;
    268         }
    269         props=&cpProps;
    270         break;
    271     default:
    272         // Will not occur because of the range check above.
    273         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    274         return NULL;
    275     }
    276     props->start=start;
    277     props->end=end;
    278     while((field=nextField())!=NULL) {
    279         if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; }
    280     }
    281     if(lineType==BLOCK_LINE) {
    282         blockValues=newValues;
    283     } else if(lineType==UNASSIGNED_LINE && insideBlock) {
    284         // Unset newValues for values that are the same as the block values.
    285         for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) {
    286             if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) {
    287                 newValues.remove(prop);
    288             }
    289         }
    290         for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) {
    291             int32_t index=prop-UCHAR_INT_START;
    292             if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) {
    293                 newValues.remove(prop);
    294             }
    295         }
    296     }
    297     return props;
    298 }
    299 
    300 static const struct {
    301     const char *name;
    302     int32_t prop;
    303 } ppucdProperties[]={
    304     { "Name_Alias", PPUCD_NAME_ALIAS },
    305     { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS },
    306     { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING }
    307 };
    308 
    309 // Returns TRUE for "ok to continue parsing fields".
    310 UBool
    311 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues,
    312                             UErrorCode &errorCode) {
    313     CharString pBuffer;
    314     const char *p=field;
    315     const char *v=strchr(p, '=');
    316     int binaryValue;
    317     if(*p=='-') {
    318         if(v!=NULL) {
    319             fprintf(stderr,
    320                     "error in preparsed UCD: mix of binary-property-no and "
    321                     "enum-property syntax '%s' on line %ld\n",
    322                     field, (long)lineNumber);
    323             errorCode=U_PARSE_ERROR;
    324             return FALSE;
    325         }
    326         binaryValue=0;
    327         ++p;
    328     } else if(v==NULL) {
    329         binaryValue=1;
    330     } else {
    331         binaryValue=-1;
    332         // Copy out the property name rather than modifying the field (writing a NUL).
    333         pBuffer.append(p, (int32_t)(v-p), errorCode);
    334         p=pBuffer.data();
    335         ++v;
    336     }
    337     int32_t prop=pnames->getPropertyEnum(p);
    338     if(prop<0) {
    339         for(int32_t i=0;; ++i) {
    340             if(i==UPRV_LENGTHOF(ppucdProperties)) {
    341                 // Ignore unknown property names.
    342                 return TRUE;
    343             }
    344             if(0==uprv_stricmp(p, ppucdProperties[i].name)) {
    345                 prop=ppucdProperties[i].prop;
    346                 U_ASSERT(prop>=0);
    347                 break;
    348             }
    349         }
    350     }
    351     if(prop<UCHAR_BINARY_LIMIT) {
    352         if(binaryValue>=0) {
    353             props.binProps[prop]=(UBool)binaryValue;
    354         } else {
    355             // No binary value for a binary property.
    356             fprintf(stderr,
    357                     "error in preparsed UCD: enum-property syntax '%s' "
    358                     "for binary property on line %ld\n",
    359                     field, (long)lineNumber);
    360             errorCode=U_PARSE_ERROR;
    361         }
    362     } else if(binaryValue>=0) {
    363         // Binary value for a non-binary property.
    364         fprintf(stderr,
    365                 "error in preparsed UCD: binary-property syntax '%s' "
    366                 "for non-binary property on line %ld\n",
    367                 field, (long)lineNumber);
    368         errorCode=U_PARSE_ERROR;
    369     } else if (prop < UCHAR_INT_START) {
    370         fprintf(stderr,
    371                 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n",
    372                 prop, (long)lineNumber);
    373         errorCode=U_PARSE_ERROR;
    374     } else if(prop<UCHAR_INT_LIMIT) {
    375         int32_t value=pnames->getPropertyValueEnum(prop, v);
    376         if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) {
    377             // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work.
    378             char *end;
    379             unsigned long ccc=uprv_strtoul(v, &end, 10);
    380             if(v<end && *end==0 && ccc<=254) {
    381                 value=(int32_t)ccc;
    382             }
    383         }
    384         if(value==UCHAR_INVALID_CODE) {
    385             fprintf(stderr,
    386                     "error in preparsed UCD: '%s' is not a valid value on line %ld\n",
    387                     field, (long)lineNumber);
    388             errorCode=U_PARSE_ERROR;
    389         } else {
    390             props.intProps[prop-UCHAR_INT_START]=value;
    391         }
    392     } else if(*v=='<') {
    393         // Do not parse default values like <code point>, just set null values.
    394         switch(prop) {
    395         case UCHAR_BIDI_MIRRORING_GLYPH:
    396             props.bmg=U_SENTINEL;
    397             break;
    398         case UCHAR_BIDI_PAIRED_BRACKET:
    399             props.bpb=U_SENTINEL;
    400             break;
    401         case UCHAR_SIMPLE_CASE_FOLDING:
    402             props.scf=U_SENTINEL;
    403             break;
    404         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
    405             props.slc=U_SENTINEL;
    406             break;
    407         case UCHAR_SIMPLE_TITLECASE_MAPPING:
    408             props.stc=U_SENTINEL;
    409             break;
    410         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
    411             props.suc=U_SENTINEL;
    412             break;
    413         case UCHAR_CASE_FOLDING:
    414             props.cf.remove();
    415             break;
    416         case UCHAR_LOWERCASE_MAPPING:
    417             props.lc.remove();
    418             break;
    419         case UCHAR_TITLECASE_MAPPING:
    420             props.tc.remove();
    421             break;
    422         case UCHAR_UPPERCASE_MAPPING:
    423             props.uc.remove();
    424             break;
    425         case UCHAR_SCRIPT_EXTENSIONS:
    426             props.scx.clear();
    427             break;
    428         default:
    429             fprintf(stderr,
    430                     "error in preparsed UCD: '%s' is not a valid default value on line %ld\n",
    431                     field, (long)lineNumber);
    432             errorCode=U_PARSE_ERROR;
    433         }
    434     } else {
    435         char c;
    436         switch(prop) {
    437         case UCHAR_NUMERIC_VALUE:
    438             props.numericValue=v;
    439             c=*v;
    440             if('0'<=c && c<='9' && v[1]==0) {
    441                 props.digitValue=c-'0';
    442             } else {
    443                 props.digitValue=-1;
    444             }
    445             break;
    446         case UCHAR_NAME:
    447             props.name=v;
    448             break;
    449         case UCHAR_AGE:
    450             u_versionFromString(props.age, v);  // Writes 0.0.0.0 if v is not numeric.
    451             break;
    452         case UCHAR_BIDI_MIRRORING_GLYPH:
    453             props.bmg=parseCodePoint(v, errorCode);
    454             break;
    455         case UCHAR_BIDI_PAIRED_BRACKET:
    456             props.bpb=parseCodePoint(v, errorCode);
    457             break;
    458         case UCHAR_SIMPLE_CASE_FOLDING:
    459             props.scf=parseCodePoint(v, errorCode);
    460             break;
    461         case UCHAR_SIMPLE_LOWERCASE_MAPPING:
    462             props.slc=parseCodePoint(v, errorCode);
    463             break;
    464         case UCHAR_SIMPLE_TITLECASE_MAPPING:
    465             props.stc=parseCodePoint(v, errorCode);
    466             break;
    467         case UCHAR_SIMPLE_UPPERCASE_MAPPING:
    468             props.suc=parseCodePoint(v, errorCode);
    469             break;
    470         case UCHAR_CASE_FOLDING:
    471             parseString(v, props.cf, errorCode);
    472             break;
    473         case UCHAR_LOWERCASE_MAPPING:
    474             parseString(v, props.lc, errorCode);
    475             break;
    476         case UCHAR_TITLECASE_MAPPING:
    477             parseString(v, props.tc, errorCode);
    478             break;
    479         case UCHAR_UPPERCASE_MAPPING:
    480             parseString(v, props.uc, errorCode);
    481             break;
    482         case PPUCD_NAME_ALIAS:
    483             props.nameAlias=v;
    484             break;
    485         case PPUCD_CONDITIONAL_CASE_MAPPINGS:
    486         case PPUCD_TURKIC_CASE_FOLDING:
    487             // No need to parse their values: They are hardcoded in the runtime library.
    488             break;
    489         case UCHAR_SCRIPT_EXTENSIONS:
    490             parseScriptExtensions(v, props.scx, errorCode);
    491             break;
    492         default:
    493             // Ignore unhandled properties.
    494             return TRUE;
    495         }
    496     }
    497     if(U_SUCCESS(errorCode)) {
    498         newValues.add((UChar32)prop);
    499         return TRUE;
    500     } else {
    501         return FALSE;
    502     }
    503 }
    504 
    505 UBool
    506 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
    507     if(U_FAILURE(errorCode)) { return FALSE; }
    508     if(lineType!=ALG_NAMES_RANGE_LINE) {
    509         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    510         return FALSE;
    511     }
    512     firstField();
    513     const char *field=nextField();
    514     if(field==NULL) {
    515         // No range field after the type.
    516         fprintf(stderr,
    517                 "error in preparsed UCD: missing algnamesrange range field "
    518                 "(no second field) on line %ld\n",
    519                 (long)lineNumber);
    520         errorCode=U_PARSE_ERROR;
    521         return FALSE;
    522     }
    523     return parseCodePointRange(field, start, end, errorCode);
    524 }
    525 
    526 UChar32
    527 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) {
    528     char *end;
    529     uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16);
    530     if(end<=s || *end!=0 || value>=0x110000) {
    531         fprintf(stderr,
    532                 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n",
    533                 s, (long)lineNumber);
    534         errorCode=U_PARSE_ERROR;
    535         return U_SENTINEL;
    536     }
    537     return (UChar32)value;
    538 }
    539 
    540 UBool
    541 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) {
    542     uint32_t st, e;
    543     u_parseCodePointRange(s, &st, &e, &errorCode);
    544     if(U_FAILURE(errorCode)) {
    545         fprintf(stderr,
    546                 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n",
    547                 s, (long)lineNumber);
    548         return FALSE;
    549     }
    550     start=(UChar32)st;
    551     end=(UChar32)e;
    552     return TRUE;
    553 }
    554 
    555 void
    556 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) {
    557     UChar *buffer=toUCharPtr(uni.getBuffer(-1));
    558     int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
    559     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
    560         errorCode=U_ZERO_ERROR;
    561         uni.releaseBuffer(0);
    562         buffer=toUCharPtr(uni.getBuffer(length));
    563         length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode);
    564     }
    565     uni.releaseBuffer(length);
    566     if(U_FAILURE(errorCode)) {
    567         fprintf(stderr,
    568                 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n",
    569                 s, (long)lineNumber);
    570     }
    571 }
    572 
    573 void
    574 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) {
    575     if(U_FAILURE(errorCode)) { return; }
    576     scx.clear();
    577     CharString scString;
    578     for(;;) {
    579         const char *scs;
    580         const char *scLimit=strchr(s, ' ');
    581         if(scLimit!=NULL) {
    582             scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data();
    583             if(U_FAILURE(errorCode)) { return; }
    584         } else {
    585             scs=s;
    586         }
    587         int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs);
    588         if(script==UCHAR_INVALID_CODE) {
    589             fprintf(stderr,
    590                     "error in preparsed UCD: '%s' is not a valid script code on line %ld\n",
    591                     scs, (long)lineNumber);
    592             errorCode=U_PARSE_ERROR;
    593             return;
    594         } else if(scx.contains(script)) {
    595             fprintf(stderr,
    596                     "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n",
    597                     scs, (long)lineNumber);
    598             errorCode=U_PARSE_ERROR;
    599             return;
    600         } else {
    601             scx.add(script);
    602         }
    603         if(scLimit!=NULL) {
    604             s=scLimit+1;
    605         } else {
    606             break;
    607         }
    608     }
    609     if(scx.isEmpty()) {
    610         fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber);
    611         errorCode=U_PARSE_ERROR;
    612     }
    613 }
    614 
    615 U_NAMESPACE_END
    616