1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2011-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: ppucd.cpp 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2011dec11 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 #include "unicode/uchar.h" 19 #include "charstr.h" 20 #include "cstring.h" 21 #include "ppucd.h" 22 #include "uassert.h" 23 #include "uparse.h" 24 25 #include <stdio.h> 26 #include <string.h> 27 28 U_NAMESPACE_BEGIN 29 30 PropertyNames::~PropertyNames() {} 31 32 int32_t 33 PropertyNames::getPropertyEnum(const char *name) const { 34 return u_getPropertyEnum(name); 35 } 36 37 int32_t 38 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const { 39 return u_getPropertyValueEnum((UProperty)property, name); 40 } 41 42 UniProps::UniProps() 43 : start(U_SENTINEL), end(U_SENTINEL), 44 bmg(U_SENTINEL), bpb(U_SENTINEL), 45 scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL), 46 digitValue(-1), numericValue(NULL), 47 name(NULL), nameAlias(NULL) { 48 memset(binProps, 0, sizeof(binProps)); 49 memset(intProps, 0, sizeof(intProps)); 50 memset(age, 0, 4); 51 } 52 53 UniProps::~UniProps() {} 54 55 const int32_t PreparsedUCD::kNumLineBuffers; 56 57 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode) 58 : icuPnames(new PropertyNames()), pnames(icuPnames), 59 file(NULL), 60 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0), 61 lineNumber(0), 62 lineType(NO_LINE), 63 fieldLimit(NULL), lineLimit(NULL) { 64 if(U_FAILURE(errorCode)) { return; } 65 66 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) { 67 filename=NULL; 68 file=stdin; 69 } else { 70 file=fopen(filename, "r"); 71 } 72 if(file==NULL) { 73 perror("error opening preparsed UCD"); 74 fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\""); 75 errorCode=U_FILE_ACCESS_ERROR; 76 return; 77 } 78 79 memset(ucdVersion, 0, 4); 80 lines[0][0]=0; 81 } 82 83 PreparsedUCD::~PreparsedUCD() { 84 if(file!=stdin) { 85 fclose(file); 86 } 87 delete icuPnames; 88 } 89 90 // Same order as the LineType values. 91 static const char *lineTypeStrings[]={ 92 NULL, 93 NULL, 94 "ucd", 95 "property", 96 "binary", 97 "value", 98 "defaults", 99 "block", 100 "cp", 101 "unassigned", 102 "algnamesrange" 103 }; 104 105 PreparsedUCD::LineType 106 PreparsedUCD::readLine(UErrorCode &errorCode) { 107 if(U_FAILURE(errorCode)) { return NO_LINE; } 108 // Select the next available line buffer. 109 while(!isLineBufferAvailable(lineIndex)) { 110 ++lineIndex; 111 if (lineIndex == kNumLineBuffers) { 112 lineIndex = 0; 113 } 114 } 115 char *line=lines[lineIndex]; 116 *line=0; 117 lineLimit=fieldLimit=line; 118 lineType=NO_LINE; 119 char *result=fgets(line, sizeof(lines[0]), file); 120 if(result==NULL) { 121 if(ferror(file)) { 122 perror("error reading preparsed UCD"); 123 fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber); 124 errorCode=U_FILE_ACCESS_ERROR; 125 } 126 return NO_LINE; 127 } 128 ++lineNumber; 129 if(*line=='#') { 130 fieldLimit=strchr(line, 0); 131 return lineType=EMPTY_LINE; 132 } 133 // Remove trailing /r/n. 134 char c; 135 char *limit=strchr(line, 0); 136 while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; } 137 // Remove trailing white space. 138 while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; } 139 *limit=0; 140 lineLimit=limit; 141 if(line==limit) { 142 fieldLimit=limit; 143 return lineType=EMPTY_LINE; 144 } 145 // Split by ';'. 146 char *semi=line; 147 while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; } 148 fieldLimit=strchr(line, 0); 149 // Determine the line type. 150 int32_t type; 151 for(type=EMPTY_LINE+1;; ++type) { 152 if(type==LINE_TYPE_COUNT) { 153 fprintf(stderr, 154 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n", 155 line, (long)lineNumber); 156 errorCode=U_PARSE_ERROR; 157 return NO_LINE; 158 } 159 if(0==strcmp(line, lineTypeStrings[type])) { 160 break; 161 } 162 } 163 lineType=(LineType)type; 164 if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) { 165 u_versionFromString(ucdVersion, fieldLimit+1); 166 } 167 return lineType; 168 } 169 170 const char * 171 PreparsedUCD::firstField() { 172 char *field=lines[lineIndex]; 173 fieldLimit=strchr(field, 0); 174 return field; 175 } 176 177 const char * 178 PreparsedUCD::nextField() { 179 if(fieldLimit==lineLimit) { return NULL; } 180 char *field=fieldLimit+1; 181 fieldLimit=strchr(field, 0); 182 return field; 183 } 184 185 const UniProps * 186 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) { 187 if(U_FAILURE(errorCode)) { return NULL; } 188 newValues.clear(); 189 if(!lineHasPropertyValues()) { 190 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 191 return NULL; 192 } 193 firstField(); 194 const char *field=nextField(); 195 if(field==NULL) { 196 // No range field after the type. 197 fprintf(stderr, 198 "error in preparsed UCD: missing default/block/cp range field " 199 "(no second field) on line %ld\n", 200 (long)lineNumber); 201 errorCode=U_PARSE_ERROR; 202 return NULL; 203 } 204 UChar32 start, end; 205 if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; } 206 UniProps *props; 207 UBool insideBlock=FALSE; // TRUE if cp or unassigned range inside the block range. 208 switch(lineType) { 209 case DEFAULTS_LINE: 210 // Should occur before any block/cp/unassigned line. 211 if(blockLineIndex>=0) { 212 fprintf(stderr, 213 "error in preparsed UCD: default line %ld after one or more block lines\n", 214 (long)lineNumber); 215 errorCode=U_PARSE_ERROR; 216 return NULL; 217 } 218 if(defaultLineIndex>=0) { 219 fprintf(stderr, 220 "error in preparsed UCD: second line with default properties on line %ld\n", 221 (long)lineNumber); 222 errorCode=U_PARSE_ERROR; 223 return NULL; 224 } 225 if(start!=0 || end!=0x10ffff) { 226 fprintf(stderr, 227 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n", 228 field, (long)lineNumber); 229 errorCode=U_PARSE_ERROR; 230 return NULL; 231 } 232 props=&defaultProps; 233 defaultLineIndex=lineIndex; 234 break; 235 case BLOCK_LINE: 236 blockProps=defaultProps; // Block inherits default properties. 237 props=&blockProps; 238 blockLineIndex=lineIndex; 239 break; 240 case CP_LINE: 241 case UNASSIGNED_LINE: 242 if(blockProps.start<=start && end<=blockProps.end) { 243 insideBlock=TRUE; 244 if(lineType==CP_LINE) { 245 // Code point range fully inside the last block inherits the block properties. 246 cpProps=blockProps; 247 } else { 248 // Unassigned line inside the block is based on default properties 249 // which override block properties. 250 cpProps=defaultProps; 251 newValues=blockValues; 252 // Except, it inherits the one blk=Block property. 253 int32_t blkIndex=UCHAR_BLOCK-UCHAR_INT_START; 254 cpProps.intProps[blkIndex]=blockProps.intProps[blkIndex]; 255 newValues.remove((UChar32)UCHAR_BLOCK); 256 } 257 } else if(start>blockProps.end || end<blockProps.start) { 258 // Code point range fully outside the last block inherits the default properties. 259 cpProps=defaultProps; 260 } else { 261 // Code point range partially overlapping with the last block is illegal. 262 fprintf(stderr, 263 "error in preparsed UCD: cp range %s on line %ld only " 264 "partially overlaps with block range %04lX..%04lX\n", 265 field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end); 266 errorCode=U_PARSE_ERROR; 267 return NULL; 268 } 269 props=&cpProps; 270 break; 271 default: 272 // Will not occur because of the range check above. 273 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 274 return NULL; 275 } 276 props->start=start; 277 props->end=end; 278 while((field=nextField())!=NULL) { 279 if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; } 280 } 281 if(lineType==BLOCK_LINE) { 282 blockValues=newValues; 283 } else if(lineType==UNASSIGNED_LINE && insideBlock) { 284 // Unset newValues for values that are the same as the block values. 285 for(int32_t prop=0; prop<UCHAR_BINARY_LIMIT; ++prop) { 286 if(newValues.contains(prop) && cpProps.binProps[prop]==blockProps.binProps[prop]) { 287 newValues.remove(prop); 288 } 289 } 290 for(int32_t prop=UCHAR_INT_START; prop<UCHAR_INT_LIMIT; ++prop) { 291 int32_t index=prop-UCHAR_INT_START; 292 if(newValues.contains(prop) && cpProps.intProps[index]==blockProps.intProps[index]) { 293 newValues.remove(prop); 294 } 295 } 296 } 297 return props; 298 } 299 300 static const struct { 301 const char *name; 302 int32_t prop; 303 } ppucdProperties[]={ 304 { "Name_Alias", PPUCD_NAME_ALIAS }, 305 { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS }, 306 { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING } 307 }; 308 309 // Returns TRUE for "ok to continue parsing fields". 310 UBool 311 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, 312 UErrorCode &errorCode) { 313 CharString pBuffer; 314 const char *p=field; 315 const char *v=strchr(p, '='); 316 int binaryValue; 317 if(*p=='-') { 318 if(v!=NULL) { 319 fprintf(stderr, 320 "error in preparsed UCD: mix of binary-property-no and " 321 "enum-property syntax '%s' on line %ld\n", 322 field, (long)lineNumber); 323 errorCode=U_PARSE_ERROR; 324 return FALSE; 325 } 326 binaryValue=0; 327 ++p; 328 } else if(v==NULL) { 329 binaryValue=1; 330 } else { 331 binaryValue=-1; 332 // Copy out the property name rather than modifying the field (writing a NUL). 333 pBuffer.append(p, (int32_t)(v-p), errorCode); 334 p=pBuffer.data(); 335 ++v; 336 } 337 int32_t prop=pnames->getPropertyEnum(p); 338 if(prop<0) { 339 for(int32_t i=0;; ++i) { 340 if(i==UPRV_LENGTHOF(ppucdProperties)) { 341 // Ignore unknown property names. 342 return TRUE; 343 } 344 if(0==uprv_stricmp(p, ppucdProperties[i].name)) { 345 prop=ppucdProperties[i].prop; 346 U_ASSERT(prop>=0); 347 break; 348 } 349 } 350 } 351 if(prop<UCHAR_BINARY_LIMIT) { 352 if(binaryValue>=0) { 353 props.binProps[prop]=(UBool)binaryValue; 354 } else { 355 // No binary value for a binary property. 356 fprintf(stderr, 357 "error in preparsed UCD: enum-property syntax '%s' " 358 "for binary property on line %ld\n", 359 field, (long)lineNumber); 360 errorCode=U_PARSE_ERROR; 361 } 362 } else if(binaryValue>=0) { 363 // Binary value for a non-binary property. 364 fprintf(stderr, 365 "error in preparsed UCD: binary-property syntax '%s' " 366 "for non-binary property on line %ld\n", 367 field, (long)lineNumber); 368 errorCode=U_PARSE_ERROR; 369 } else if (prop < UCHAR_INT_START) { 370 fprintf(stderr, 371 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n", 372 prop, (long)lineNumber); 373 errorCode=U_PARSE_ERROR; 374 } else if(prop<UCHAR_INT_LIMIT) { 375 int32_t value=pnames->getPropertyValueEnum(prop, v); 376 if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) { 377 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work. 378 char *end; 379 unsigned long ccc=uprv_strtoul(v, &end, 10); 380 if(v<end && *end==0 && ccc<=254) { 381 value=(int32_t)ccc; 382 } 383 } 384 if(value==UCHAR_INVALID_CODE) { 385 fprintf(stderr, 386 "error in preparsed UCD: '%s' is not a valid value on line %ld\n", 387 field, (long)lineNumber); 388 errorCode=U_PARSE_ERROR; 389 } else { 390 props.intProps[prop-UCHAR_INT_START]=value; 391 } 392 } else if(*v=='<') { 393 // Do not parse default values like <code point>, just set null values. 394 switch(prop) { 395 case UCHAR_BIDI_MIRRORING_GLYPH: 396 props.bmg=U_SENTINEL; 397 break; 398 case UCHAR_BIDI_PAIRED_BRACKET: 399 props.bpb=U_SENTINEL; 400 break; 401 case UCHAR_SIMPLE_CASE_FOLDING: 402 props.scf=U_SENTINEL; 403 break; 404 case UCHAR_SIMPLE_LOWERCASE_MAPPING: 405 props.slc=U_SENTINEL; 406 break; 407 case UCHAR_SIMPLE_TITLECASE_MAPPING: 408 props.stc=U_SENTINEL; 409 break; 410 case UCHAR_SIMPLE_UPPERCASE_MAPPING: 411 props.suc=U_SENTINEL; 412 break; 413 case UCHAR_CASE_FOLDING: 414 props.cf.remove(); 415 break; 416 case UCHAR_LOWERCASE_MAPPING: 417 props.lc.remove(); 418 break; 419 case UCHAR_TITLECASE_MAPPING: 420 props.tc.remove(); 421 break; 422 case UCHAR_UPPERCASE_MAPPING: 423 props.uc.remove(); 424 break; 425 case UCHAR_SCRIPT_EXTENSIONS: 426 props.scx.clear(); 427 break; 428 default: 429 fprintf(stderr, 430 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n", 431 field, (long)lineNumber); 432 errorCode=U_PARSE_ERROR; 433 } 434 } else { 435 char c; 436 switch(prop) { 437 case UCHAR_NUMERIC_VALUE: 438 props.numericValue=v; 439 c=*v; 440 if('0'<=c && c<='9' && v[1]==0) { 441 props.digitValue=c-'0'; 442 } else { 443 props.digitValue=-1; 444 } 445 break; 446 case UCHAR_NAME: 447 props.name=v; 448 break; 449 case UCHAR_AGE: 450 u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric. 451 break; 452 case UCHAR_BIDI_MIRRORING_GLYPH: 453 props.bmg=parseCodePoint(v, errorCode); 454 break; 455 case UCHAR_BIDI_PAIRED_BRACKET: 456 props.bpb=parseCodePoint(v, errorCode); 457 break; 458 case UCHAR_SIMPLE_CASE_FOLDING: 459 props.scf=parseCodePoint(v, errorCode); 460 break; 461 case UCHAR_SIMPLE_LOWERCASE_MAPPING: 462 props.slc=parseCodePoint(v, errorCode); 463 break; 464 case UCHAR_SIMPLE_TITLECASE_MAPPING: 465 props.stc=parseCodePoint(v, errorCode); 466 break; 467 case UCHAR_SIMPLE_UPPERCASE_MAPPING: 468 props.suc=parseCodePoint(v, errorCode); 469 break; 470 case UCHAR_CASE_FOLDING: 471 parseString(v, props.cf, errorCode); 472 break; 473 case UCHAR_LOWERCASE_MAPPING: 474 parseString(v, props.lc, errorCode); 475 break; 476 case UCHAR_TITLECASE_MAPPING: 477 parseString(v, props.tc, errorCode); 478 break; 479 case UCHAR_UPPERCASE_MAPPING: 480 parseString(v, props.uc, errorCode); 481 break; 482 case PPUCD_NAME_ALIAS: 483 props.nameAlias=v; 484 break; 485 case PPUCD_CONDITIONAL_CASE_MAPPINGS: 486 case PPUCD_TURKIC_CASE_FOLDING: 487 // No need to parse their values: They are hardcoded in the runtime library. 488 break; 489 case UCHAR_SCRIPT_EXTENSIONS: 490 parseScriptExtensions(v, props.scx, errorCode); 491 break; 492 default: 493 // Ignore unhandled properties. 494 return TRUE; 495 } 496 } 497 if(U_SUCCESS(errorCode)) { 498 newValues.add((UChar32)prop); 499 return TRUE; 500 } else { 501 return FALSE; 502 } 503 } 504 505 UBool 506 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) { 507 if(U_FAILURE(errorCode)) { return FALSE; } 508 if(lineType!=ALG_NAMES_RANGE_LINE) { 509 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 510 return FALSE; 511 } 512 firstField(); 513 const char *field=nextField(); 514 if(field==NULL) { 515 // No range field after the type. 516 fprintf(stderr, 517 "error in preparsed UCD: missing algnamesrange range field " 518 "(no second field) on line %ld\n", 519 (long)lineNumber); 520 errorCode=U_PARSE_ERROR; 521 return FALSE; 522 } 523 return parseCodePointRange(field, start, end, errorCode); 524 } 525 526 UChar32 527 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) { 528 char *end; 529 uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16); 530 if(end<=s || *end!=0 || value>=0x110000) { 531 fprintf(stderr, 532 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n", 533 s, (long)lineNumber); 534 errorCode=U_PARSE_ERROR; 535 return U_SENTINEL; 536 } 537 return (UChar32)value; 538 } 539 540 UBool 541 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) { 542 uint32_t st, e; 543 u_parseCodePointRange(s, &st, &e, &errorCode); 544 if(U_FAILURE(errorCode)) { 545 fprintf(stderr, 546 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n", 547 s, (long)lineNumber); 548 return FALSE; 549 } 550 start=(UChar32)st; 551 end=(UChar32)e; 552 return TRUE; 553 } 554 555 void 556 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { 557 UChar *buffer=toUCharPtr(uni.getBuffer(-1)); 558 int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); 559 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 560 errorCode=U_ZERO_ERROR; 561 uni.releaseBuffer(0); 562 buffer=toUCharPtr(uni.getBuffer(length)); 563 length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); 564 } 565 uni.releaseBuffer(length); 566 if(U_FAILURE(errorCode)) { 567 fprintf(stderr, 568 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", 569 s, (long)lineNumber); 570 } 571 } 572 573 void 574 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) { 575 if(U_FAILURE(errorCode)) { return; } 576 scx.clear(); 577 CharString scString; 578 for(;;) { 579 const char *scs; 580 const char *scLimit=strchr(s, ' '); 581 if(scLimit!=NULL) { 582 scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data(); 583 if(U_FAILURE(errorCode)) { return; } 584 } else { 585 scs=s; 586 } 587 int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs); 588 if(script==UCHAR_INVALID_CODE) { 589 fprintf(stderr, 590 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n", 591 scs, (long)lineNumber); 592 errorCode=U_PARSE_ERROR; 593 return; 594 } else if(scx.contains(script)) { 595 fprintf(stderr, 596 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n", 597 scs, (long)lineNumber); 598 errorCode=U_PARSE_ERROR; 599 return; 600 } else { 601 scx.add(script); 602 } 603 if(scLimit!=NULL) { 604 s=scLimit+1; 605 } else { 606 break; 607 } 608 } 609 if(scx.isEmpty()) { 610 fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber); 611 errorCode=U_PARSE_ERROR; 612 } 613 } 614 615 U_NAMESPACE_END 616