1 /* 2 ******************************************************************************* 3 * Copyright (C) 2011-2013, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ppucd.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2011dec11 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 #include "unicode/uchar.h" 17 #include "charstr.h" 18 #include "cstring.h" 19 #include "ppucd.h" 20 #include "uassert.h" 21 #include "uparse.h" 22 23 #include <stdio.h> 24 #include <string.h> 25 26 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 27 28 U_NAMESPACE_BEGIN 29 30 PropertyNames::~PropertyNames() {} 31 32 int32_t 33 PropertyNames::getPropertyEnum(const char *name) const { 34 return u_getPropertyEnum(name); 35 } 36 37 int32_t 38 PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const { 39 return u_getPropertyValueEnum((UProperty)property, name); 40 } 41 42 UniProps::UniProps() 43 : start(U_SENTINEL), end(U_SENTINEL), 44 bmg(U_SENTINEL), bpb(U_SENTINEL), 45 scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL), 46 digitValue(-1), numericValue(NULL), 47 name(NULL), nameAlias(NULL) { 48 memset(binProps, 0, sizeof(binProps)); 49 memset(intProps, 0, sizeof(intProps)); 50 memset(age, 0, 4); 51 } 52 53 UniProps::~UniProps() {} 54 55 const int32_t PreparsedUCD::kNumLineBuffers; 56 57 PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode) 58 : icuPnames(new PropertyNames()), pnames(icuPnames), 59 file(NULL), 60 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0), 61 lineNumber(0), 62 lineType(NO_LINE), 63 fieldLimit(NULL), lineLimit(NULL) { 64 if(U_FAILURE(errorCode)) { return; } 65 66 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) { 67 filename=NULL; 68 file=stdin; 69 } else { 70 file=fopen(filename, "r"); 71 } 72 if(file==NULL) { 73 perror("error opening preparsed UCD"); 74 fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\""); 75 errorCode=U_FILE_ACCESS_ERROR; 76 return; 77 } 78 79 memset(ucdVersion, 0, 4); 80 lines[0][0]=0; 81 } 82 83 PreparsedUCD::~PreparsedUCD() { 84 if(file!=stdin) { 85 fclose(file); 86 } 87 delete icuPnames; 88 } 89 90 // Same order as the LineType values. 91 static const char *lineTypeStrings[]={ 92 NULL, 93 NULL, 94 "ucd", 95 "property", 96 "binary", 97 "value", 98 "defaults", 99 "block", 100 "cp", 101 "algnamesrange" 102 }; 103 104 PreparsedUCD::LineType 105 PreparsedUCD::readLine(UErrorCode &errorCode) { 106 if(U_FAILURE(errorCode)) { return NO_LINE; } 107 // Select the next available line buffer. 108 while(!isLineBufferAvailable(lineIndex)) { 109 ++lineIndex; 110 if (lineIndex == kNumLineBuffers) { 111 lineIndex = 0; 112 } 113 } 114 char *line=lines[lineIndex]; 115 *line=0; 116 lineLimit=fieldLimit=line; 117 lineType=NO_LINE; 118 char *result=fgets(line, sizeof(lines[0]), file); 119 if(result==NULL) { 120 if(ferror(file)) { 121 perror("error reading preparsed UCD"); 122 fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber); 123 errorCode=U_FILE_ACCESS_ERROR; 124 } 125 return NO_LINE; 126 } 127 ++lineNumber; 128 if(*line=='#') { 129 fieldLimit=strchr(line, 0); 130 return lineType=EMPTY_LINE; 131 } 132 // Remove trailing /r/n. 133 char c; 134 char *limit=strchr(line, 0); 135 while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; } 136 // Remove trailing white space. 137 while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; } 138 *limit=0; 139 lineLimit=limit; 140 if(line==limit) { 141 fieldLimit=limit; 142 return lineType=EMPTY_LINE; 143 } 144 // Split by ';'. 145 char *semi=line; 146 while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; } 147 fieldLimit=strchr(line, 0); 148 // Determine the line type. 149 int32_t type; 150 for(type=EMPTY_LINE+1;; ++type) { 151 if(type==LINE_TYPE_COUNT) { 152 fprintf(stderr, 153 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n", 154 line, (long)lineNumber); 155 errorCode=U_PARSE_ERROR; 156 return NO_LINE; 157 } 158 if(0==strcmp(line, lineTypeStrings[type])) { 159 break; 160 } 161 } 162 lineType=(LineType)type; 163 if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) { 164 u_versionFromString(ucdVersion, fieldLimit+1); 165 } 166 return lineType; 167 } 168 169 const char * 170 PreparsedUCD::firstField() { 171 char *field=lines[lineIndex]; 172 fieldLimit=strchr(field, 0); 173 return field; 174 } 175 176 const char * 177 PreparsedUCD::nextField() { 178 if(fieldLimit==lineLimit) { return NULL; } 179 char *field=fieldLimit+1; 180 fieldLimit=strchr(field, 0); 181 return field; 182 } 183 184 const UniProps * 185 PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) { 186 if(U_FAILURE(errorCode)) { return NULL; } 187 newValues.clear(); 188 if(!lineHasPropertyValues()) { 189 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 190 return NULL; 191 } 192 firstField(); 193 const char *field=nextField(); 194 if(field==NULL) { 195 // No range field after the type. 196 fprintf(stderr, 197 "error in preparsed UCD: missing default/block/cp range field " 198 "(no second field) on line %ld\n", 199 (long)lineNumber); 200 errorCode=U_PARSE_ERROR; 201 return NULL; 202 } 203 UChar32 start, end; 204 if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; } 205 UniProps *props; 206 switch(lineType) { 207 case DEFAULTS_LINE: 208 if(defaultLineIndex>=0) { 209 fprintf(stderr, 210 "error in preparsed UCD: second line with default properties on line %ld\n", 211 (long)lineNumber); 212 errorCode=U_PARSE_ERROR; 213 return NULL; 214 } 215 if(start!=0 || end!=0x10ffff) { 216 fprintf(stderr, 217 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n", 218 field, (long)lineNumber); 219 errorCode=U_PARSE_ERROR; 220 return NULL; 221 } 222 props=&defaultProps; 223 defaultLineIndex=lineIndex; 224 break; 225 case BLOCK_LINE: 226 blockProps=defaultProps; // Block inherits default properties. 227 props=&blockProps; 228 blockLineIndex=lineIndex; 229 break; 230 case CP_LINE: 231 if(blockProps.start<=start && end<=blockProps.end) { 232 // Code point range fully inside the last block inherits the block properties. 233 cpProps=blockProps; 234 } else if(start>blockProps.end || end<blockProps.start) { 235 // Code point range fully outside the last block inherits the default properties. 236 cpProps=defaultProps; 237 } else { 238 // Code point range partially overlapping with the last block is illegal. 239 fprintf(stderr, 240 "error in preparsed UCD: cp range %s on line %ld only " 241 "partially overlaps with block range %04lX..%04lX\n", 242 field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end); 243 errorCode=U_PARSE_ERROR; 244 return NULL; 245 } 246 props=&cpProps; 247 break; 248 default: 249 // Will not occur because of the range check above. 250 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 251 return NULL; 252 } 253 props->start=start; 254 props->end=end; 255 while((field=nextField())!=NULL) { 256 if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; } 257 } 258 return props; 259 } 260 261 static const struct { 262 const char *name; 263 int32_t prop; 264 } ppucdProperties[]={ 265 { "Name_Alias", PPUCD_NAME_ALIAS }, 266 { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS }, 267 { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING } 268 }; 269 270 // Returns TRUE for "ok to continue parsing fields". 271 UBool 272 PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, 273 UErrorCode &errorCode) { 274 CharString pBuffer; 275 const char *p=field; 276 const char *v=strchr(p, '='); 277 int binaryValue; 278 if(*p=='-') { 279 if(v!=NULL) { 280 fprintf(stderr, 281 "error in preparsed UCD: mix of binary-property-no and " 282 "enum-property syntax '%s' on line %ld\n", 283 field, (long)lineNumber); 284 errorCode=U_PARSE_ERROR; 285 return FALSE; 286 } 287 binaryValue=0; 288 ++p; 289 } else if(v==NULL) { 290 binaryValue=1; 291 } else { 292 binaryValue=-1; 293 // Copy out the property name rather than modifying the field (writing a NUL). 294 pBuffer.append(p, (int32_t)(v-p), errorCode); 295 p=pBuffer.data(); 296 ++v; 297 } 298 int32_t prop=pnames->getPropertyEnum(p); 299 if(prop<0) { 300 for(int32_t i=0;; ++i) { 301 if(i==LENGTHOF(ppucdProperties)) { 302 // Ignore unknown property names. 303 return TRUE; 304 } 305 if(0==uprv_stricmp(p, ppucdProperties[i].name)) { 306 prop=ppucdProperties[i].prop; 307 U_ASSERT(prop>=0); 308 break; 309 } 310 } 311 } 312 if(prop<UCHAR_BINARY_LIMIT) { 313 if(binaryValue>=0) { 314 props.binProps[prop]=(UBool)binaryValue; 315 } else { 316 // No binary value for a binary property. 317 fprintf(stderr, 318 "error in preparsed UCD: enum-property syntax '%s' " 319 "for binary property on line %ld\n", 320 field, (long)lineNumber); 321 errorCode=U_PARSE_ERROR; 322 } 323 } else if(binaryValue>=0) { 324 // Binary value for a non-binary property. 325 fprintf(stderr, 326 "error in preparsed UCD: binary-property syntax '%s' " 327 "for non-binary property on line %ld\n", 328 field, (long)lineNumber); 329 errorCode=U_PARSE_ERROR; 330 } else if (prop < UCHAR_INT_START) { 331 fprintf(stderr, 332 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n", 333 prop, (long)lineNumber); 334 errorCode=U_PARSE_ERROR; 335 } else if(prop<UCHAR_INT_LIMIT) { 336 int32_t value=pnames->getPropertyValueEnum(prop, v); 337 if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) { 338 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work. 339 char *end; 340 unsigned long ccc=uprv_strtoul(v, &end, 10); 341 if(v<end && *end==0 && ccc<=254) { 342 value=(int32_t)ccc; 343 } 344 } 345 if(value==UCHAR_INVALID_CODE) { 346 fprintf(stderr, 347 "error in preparsed UCD: '%s' is not a valid value on line %ld\n", 348 field, (long)lineNumber); 349 errorCode=U_PARSE_ERROR; 350 } else { 351 props.intProps[prop-UCHAR_INT_START]=value; 352 } 353 } else if(*v=='<') { 354 // Do not parse default values like <code point>, just set null values. 355 switch(prop) { 356 case UCHAR_BIDI_MIRRORING_GLYPH: 357 props.bmg=U_SENTINEL; 358 break; 359 case UCHAR_BIDI_PAIRED_BRACKET: 360 props.bpb=U_SENTINEL; 361 break; 362 case UCHAR_SIMPLE_CASE_FOLDING: 363 props.scf=U_SENTINEL; 364 break; 365 case UCHAR_SIMPLE_LOWERCASE_MAPPING: 366 props.slc=U_SENTINEL; 367 break; 368 case UCHAR_SIMPLE_TITLECASE_MAPPING: 369 props.stc=U_SENTINEL; 370 break; 371 case UCHAR_SIMPLE_UPPERCASE_MAPPING: 372 props.suc=U_SENTINEL; 373 break; 374 case UCHAR_CASE_FOLDING: 375 props.cf.remove(); 376 break; 377 case UCHAR_LOWERCASE_MAPPING: 378 props.lc.remove(); 379 break; 380 case UCHAR_TITLECASE_MAPPING: 381 props.tc.remove(); 382 break; 383 case UCHAR_UPPERCASE_MAPPING: 384 props.uc.remove(); 385 break; 386 case UCHAR_SCRIPT_EXTENSIONS: 387 props.scx.clear(); 388 break; 389 default: 390 fprintf(stderr, 391 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n", 392 field, (long)lineNumber); 393 errorCode=U_PARSE_ERROR; 394 } 395 } else { 396 char c; 397 switch(prop) { 398 case UCHAR_NUMERIC_VALUE: 399 props.numericValue=v; 400 c=*v; 401 if('0'<=c && c<='9' && v[1]==0) { 402 props.digitValue=c-'0'; 403 } else { 404 props.digitValue=-1; 405 } 406 break; 407 case UCHAR_NAME: 408 props.name=v; 409 break; 410 case UCHAR_AGE: 411 u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric. 412 break; 413 case UCHAR_BIDI_MIRRORING_GLYPH: 414 props.bmg=parseCodePoint(v, errorCode); 415 break; 416 case UCHAR_BIDI_PAIRED_BRACKET: 417 props.bpb=parseCodePoint(v, errorCode); 418 break; 419 case UCHAR_SIMPLE_CASE_FOLDING: 420 props.scf=parseCodePoint(v, errorCode); 421 break; 422 case UCHAR_SIMPLE_LOWERCASE_MAPPING: 423 props.slc=parseCodePoint(v, errorCode); 424 break; 425 case UCHAR_SIMPLE_TITLECASE_MAPPING: 426 props.stc=parseCodePoint(v, errorCode); 427 break; 428 case UCHAR_SIMPLE_UPPERCASE_MAPPING: 429 props.suc=parseCodePoint(v, errorCode); 430 break; 431 case UCHAR_CASE_FOLDING: 432 parseString(v, props.cf, errorCode); 433 break; 434 case UCHAR_LOWERCASE_MAPPING: 435 parseString(v, props.lc, errorCode); 436 break; 437 case UCHAR_TITLECASE_MAPPING: 438 parseString(v, props.tc, errorCode); 439 break; 440 case UCHAR_UPPERCASE_MAPPING: 441 parseString(v, props.uc, errorCode); 442 break; 443 case PPUCD_NAME_ALIAS: 444 props.nameAlias=v; 445 break; 446 case PPUCD_CONDITIONAL_CASE_MAPPINGS: 447 case PPUCD_TURKIC_CASE_FOLDING: 448 // No need to parse their values: They are hardcoded in the runtime library. 449 break; 450 case UCHAR_SCRIPT_EXTENSIONS: 451 parseScriptExtensions(v, props.scx, errorCode); 452 break; 453 default: 454 // Ignore unhandled properties. 455 return TRUE; 456 } 457 } 458 if(U_SUCCESS(errorCode)) { 459 newValues.add((UChar32)prop); 460 return TRUE; 461 } else { 462 return FALSE; 463 } 464 } 465 466 UBool 467 PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) { 468 if(U_FAILURE(errorCode)) { return FALSE; } 469 if(lineType!=ALG_NAMES_RANGE_LINE) { 470 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 471 return FALSE; 472 } 473 firstField(); 474 const char *field=nextField(); 475 if(field==NULL) { 476 // No range field after the type. 477 fprintf(stderr, 478 "error in preparsed UCD: missing algnamesrange range field " 479 "(no second field) on line %ld\n", 480 (long)lineNumber); 481 errorCode=U_PARSE_ERROR; 482 return FALSE; 483 } 484 return parseCodePointRange(field, start, end, errorCode); 485 } 486 487 UChar32 488 PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) { 489 char *end; 490 uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16); 491 if(end<=s || *end!=0 || value>=0x110000) { 492 fprintf(stderr, 493 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n", 494 s, (long)lineNumber); 495 errorCode=U_PARSE_ERROR; 496 return U_SENTINEL; 497 } 498 return (UChar32)value; 499 } 500 501 UBool 502 PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) { 503 uint32_t st, e; 504 u_parseCodePointRange(s, &st, &e, &errorCode); 505 if(U_FAILURE(errorCode)) { 506 fprintf(stderr, 507 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n", 508 s, (long)lineNumber); 509 return FALSE; 510 } 511 start=(UChar32)st; 512 end=(UChar32)e; 513 return TRUE; 514 } 515 516 void 517 PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { 518 UChar *buffer=uni.getBuffer(-1); 519 int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); 520 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 521 errorCode=U_ZERO_ERROR; 522 uni.releaseBuffer(0); 523 buffer=uni.getBuffer(length); 524 length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); 525 } 526 uni.releaseBuffer(length); 527 if(U_FAILURE(errorCode)) { 528 fprintf(stderr, 529 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", 530 s, (long)lineNumber); 531 } 532 } 533 534 void 535 PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) { 536 if(U_FAILURE(errorCode)) { return; } 537 scx.clear(); 538 CharString scString; 539 for(;;) { 540 const char *scs; 541 const char *scLimit=strchr(s, ' '); 542 if(scLimit!=NULL) { 543 scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data(); 544 if(U_FAILURE(errorCode)) { return; } 545 } else { 546 scs=s; 547 } 548 int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs); 549 if(script==UCHAR_INVALID_CODE) { 550 fprintf(stderr, 551 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n", 552 scs, (long)lineNumber); 553 errorCode=U_PARSE_ERROR; 554 return; 555 } else if(scx.contains(script)) { 556 fprintf(stderr, 557 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n", 558 scs, (long)lineNumber); 559 errorCode=U_PARSE_ERROR; 560 return; 561 } else { 562 scx.add(script); 563 } 564 if(scLimit!=NULL) { 565 s=scLimit+1; 566 } else { 567 break; 568 } 569 } 570 if(scx.isEmpty()) { 571 fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber); 572 errorCode=U_PARSE_ERROR; 573 } 574 } 575 576 U_NAMESPACE_END 577