1 /* 2 ******************************************************************************* 3 * Copyright (C) 2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: messagepattern.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2011mar14 12 * created by: Markus W. Scherer 13 */ 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_FORMATTING 18 19 #include "unicode/messagepattern.h" 20 #include "unicode/unistr.h" 21 #include "cmemory.h" 22 #include "cstring.h" 23 #include "messageimpl.h" 24 #include "patternprops.h" 25 #include "putilimp.h" 26 #include "uassert.h" 27 28 U_NAMESPACE_BEGIN 29 30 // Unicode character/code point constants ---------------------------------- *** 31 32 static const UChar u_pound=0x23; 33 static const UChar u_apos=0x27; 34 static const UChar u_plus=0x2B; 35 static const UChar u_comma=0x2C; 36 static const UChar u_minus=0x2D; 37 static const UChar u_dot=0x2E; 38 static const UChar u_colon=0x3A; 39 static const UChar u_lessThan=0x3C; 40 static const UChar u_equal=0x3D; 41 static const UChar u_A=0x41; 42 static const UChar u_C=0x43; 43 static const UChar u_E=0x45; 44 static const UChar u_H=0x48; 45 static const UChar u_I=0x49; 46 static const UChar u_L=0x4C; 47 static const UChar u_O=0x4F; 48 static const UChar u_P=0x50; 49 static const UChar u_R=0x52; 50 static const UChar u_S=0x53; 51 static const UChar u_T=0x54; 52 static const UChar u_U=0x55; 53 static const UChar u_Z=0x5A; 54 static const UChar u_a=0x61; 55 static const UChar u_c=0x63; 56 static const UChar u_e=0x65; 57 static const UChar u_f=0x66; 58 static const UChar u_h=0x68; 59 static const UChar u_i=0x69; 60 static const UChar u_l=0x6C; 61 static const UChar u_o=0x6F; 62 static const UChar u_p=0x70; 63 static const UChar u_r=0x72; 64 static const UChar u_s=0x73; 65 static const UChar u_t=0x74; 66 static const UChar u_u=0x75; 67 static const UChar u_z=0x7A; 68 static const UChar u_leftCurlyBrace=0x7B; 69 static const UChar u_pipe=0x7C; 70 static const UChar u_rightCurlyBrace=0x7D; 71 static const UChar u_lessOrEqual=0x2264; // U+2264 is <= 72 73 static const UChar kOffsetColon[]={ // "offset:" 74 u_o, u_f, u_f, u_s, u_e, u_t, u_colon 75 }; 76 77 static const UChar kOther[]={ // "other" 78 u_o, u_t, u_h, u_e, u_r 79 }; 80 81 // MessagePatternList ------------------------------------------------------ *** 82 83 template<typename T, int32_t stackCapacity> 84 class MessagePatternList : public UMemory { 85 public: 86 MessagePatternList() {} 87 void copyFrom(const MessagePatternList<T, stackCapacity> &other, 88 int32_t length, 89 UErrorCode &errorCode); 90 UBool ensureCapacityForOneMore(int32_t oldLength, UErrorCode &errorCode); 91 UBool memEquals(const MessagePatternList<T, stackCapacity> &other, int32_t length) const { 92 return 0==uprv_memcmp(a.getAlias(), other.a.getAlias(), length*sizeof(T)); 93 } 94 95 MaybeStackArray<T, stackCapacity> a; 96 }; 97 98 template<typename T, int32_t stackCapacity> 99 void 100 MessagePatternList<T, stackCapacity>::copyFrom( 101 const MessagePatternList<T, stackCapacity> &other, 102 int32_t length, 103 UErrorCode &errorCode) { 104 if(U_SUCCESS(errorCode) && length>0) { 105 if(length>a.getCapacity() && NULL==a.resize(length)) { 106 errorCode=U_MEMORY_ALLOCATION_ERROR; 107 return; 108 } 109 uprv_memcpy(a.getAlias(), other.a.getAlias(), length*sizeof(T)); 110 } 111 } 112 113 template<typename T, int32_t stackCapacity> 114 UBool 115 MessagePatternList<T, stackCapacity>::ensureCapacityForOneMore(int32_t oldLength, UErrorCode &errorCode) { 116 if(U_FAILURE(errorCode)) { 117 return FALSE; 118 } 119 if(a.getCapacity()>oldLength || a.resize(2*oldLength, oldLength)!=NULL) { 120 return TRUE; 121 } 122 errorCode=U_MEMORY_ALLOCATION_ERROR; 123 return FALSE; 124 } 125 126 // MessagePatternList specializations -------------------------------------- *** 127 128 class MessagePatternDoubleList : public MessagePatternList<double, 8> { 129 }; 130 131 class MessagePatternPartsList : public MessagePatternList<MessagePattern::Part, 32> { 132 }; 133 134 // MessagePattern constructors etc. ---------------------------------------- *** 135 136 MessagePattern::MessagePattern(UErrorCode &errorCode) 137 : aposMode(UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE), 138 partsList(NULL), parts(NULL), partsLength(0), 139 numericValuesList(NULL), numericValues(NULL), numericValuesLength(0), 140 hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) { 141 init(errorCode); 142 } 143 144 MessagePattern::MessagePattern(UMessagePatternApostropheMode mode, UErrorCode &errorCode) 145 : aposMode(mode), 146 partsList(NULL), parts(NULL), partsLength(0), 147 numericValuesList(NULL), numericValues(NULL), numericValuesLength(0), 148 hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) { 149 init(errorCode); 150 } 151 152 MessagePattern::MessagePattern(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) 153 : aposMode(UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE), 154 partsList(NULL), parts(NULL), partsLength(0), 155 numericValuesList(NULL), numericValues(NULL), numericValuesLength(0), 156 hasArgNames(FALSE), hasArgNumbers(FALSE), needsAutoQuoting(FALSE) { 157 if(init(errorCode)) { 158 parse(pattern, parseError, errorCode); 159 } 160 } 161 162 UBool 163 MessagePattern::init(UErrorCode &errorCode) { 164 if(U_FAILURE(errorCode)) { 165 return FALSE; 166 } 167 partsList=new MessagePatternPartsList(); 168 if(partsList==NULL) { 169 errorCode=U_MEMORY_ALLOCATION_ERROR; 170 return FALSE; 171 } 172 parts=partsList->a.getAlias(); 173 return TRUE; 174 } 175 176 MessagePattern::MessagePattern(const MessagePattern &other) 177 : aposMode(other.aposMode), msg(other.msg), 178 partsList(NULL), parts(NULL), partsLength(0), 179 numericValuesList(NULL), numericValues(NULL), numericValuesLength(0), 180 hasArgNames(other.hasArgNames), hasArgNumbers(other.hasArgNumbers), 181 needsAutoQuoting(other.needsAutoQuoting) { 182 UErrorCode errorCode=U_ZERO_ERROR; 183 if(!copyStorage(other, errorCode)) { 184 clear(); 185 } 186 } 187 188 MessagePattern & 189 MessagePattern::operator=(const MessagePattern &other) { 190 if(this==&other) { 191 return *this; 192 } 193 aposMode=other.aposMode; 194 msg=other.msg; 195 hasArgNames=other.hasArgNames; 196 hasArgNumbers=other.hasArgNumbers; 197 needsAutoQuoting=other.needsAutoQuoting; 198 UErrorCode errorCode=U_ZERO_ERROR; 199 if(!copyStorage(other, errorCode)) { 200 clear(); 201 } 202 return *this; 203 } 204 205 UBool 206 MessagePattern::copyStorage(const MessagePattern &other, UErrorCode &errorCode) { 207 if(U_FAILURE(errorCode)) { 208 return FALSE; 209 } 210 parts=NULL; 211 partsLength=0; 212 numericValues=NULL; 213 numericValuesLength=0; 214 if(partsList==NULL) { 215 partsList=new MessagePatternPartsList(); 216 if(partsList==NULL) { 217 errorCode=U_MEMORY_ALLOCATION_ERROR; 218 return FALSE; 219 } 220 parts=partsList->a.getAlias(); 221 } 222 if(other.partsLength>0) { 223 partsList->copyFrom(*other.partsList, other.partsLength, errorCode); 224 if(U_FAILURE(errorCode)) { 225 return FALSE; 226 } 227 parts=partsList->a.getAlias(); 228 partsLength=other.partsLength; 229 } 230 if(other.numericValuesLength>0) { 231 if(numericValuesList==NULL) { 232 numericValuesList=new MessagePatternDoubleList(); 233 if(numericValuesList==NULL) { 234 errorCode=U_MEMORY_ALLOCATION_ERROR; 235 return FALSE; 236 } 237 numericValues=numericValuesList->a.getAlias(); 238 } 239 numericValuesList->copyFrom( 240 *other.numericValuesList, other.numericValuesLength, errorCode); 241 if(U_FAILURE(errorCode)) { 242 return FALSE; 243 } 244 numericValues=numericValuesList->a.getAlias(); 245 numericValuesLength=other.numericValuesLength; 246 } 247 return TRUE; 248 } 249 250 MessagePattern::~MessagePattern() { 251 delete partsList; 252 delete numericValuesList; 253 } 254 255 // MessagePattern API ------------------------------------------------------ *** 256 257 MessagePattern & 258 MessagePattern::parse(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) { 259 preParse(pattern, parseError, errorCode); 260 parseMessage(0, 0, 0, UMSGPAT_ARG_TYPE_NONE, parseError, errorCode); 261 postParse(); 262 return *this; 263 } 264 265 MessagePattern & 266 MessagePattern::parseChoiceStyle(const UnicodeString &pattern, 267 UParseError *parseError, UErrorCode &errorCode) { 268 preParse(pattern, parseError, errorCode); 269 parseChoiceStyle(0, 0, parseError, errorCode); 270 postParse(); 271 return *this; 272 } 273 274 MessagePattern & 275 MessagePattern::parsePluralStyle(const UnicodeString &pattern, 276 UParseError *parseError, UErrorCode &errorCode) { 277 preParse(pattern, parseError, errorCode); 278 parsePluralOrSelectStyle(UMSGPAT_ARG_TYPE_PLURAL, 0, 0, parseError, errorCode); 279 postParse(); 280 return *this; 281 } 282 283 MessagePattern & 284 MessagePattern::parseSelectStyle(const UnicodeString &pattern, 285 UParseError *parseError, UErrorCode &errorCode) { 286 preParse(pattern, parseError, errorCode); 287 parsePluralOrSelectStyle(UMSGPAT_ARG_TYPE_SELECT, 0, 0, parseError, errorCode); 288 postParse(); 289 return *this; 290 } 291 292 void 293 MessagePattern::clear() { 294 // Mostly the same as preParse(). 295 msg.remove(); 296 hasArgNames=hasArgNumbers=FALSE; 297 needsAutoQuoting=FALSE; 298 partsLength=0; 299 numericValuesLength=0; 300 } 301 302 UBool 303 MessagePattern::operator==(const MessagePattern &other) const { 304 if(this==&other) { 305 return TRUE; 306 } 307 return 308 aposMode==other.aposMode && 309 msg==other.msg && 310 // parts.equals(o.parts) 311 partsLength==other.partsLength && 312 (partsLength==0 || partsList->memEquals(*other.partsList, partsLength)); 313 // No need to compare numericValues if msg and parts are the same. 314 } 315 316 int32_t 317 MessagePattern::hashCode() const { 318 int32_t hash=(aposMode*37+msg.hashCode())*37+partsLength; 319 for(int32_t i=0; i<partsLength; ++i) { 320 hash=hash*37+parts[i].hashCode(); 321 } 322 return hash; 323 } 324 325 int32_t 326 MessagePattern::validateArgumentName(const UnicodeString &name) { 327 if(!PatternProps::isIdentifier(name.getBuffer(), name.length())) { 328 return UMSGPAT_ARG_NAME_NOT_VALID; 329 } 330 return parseArgNumber(name, 0, name.length()); 331 } 332 333 UnicodeString 334 MessagePattern::autoQuoteApostropheDeep() const { 335 if(!needsAutoQuoting) { 336 return msg; 337 } 338 UnicodeString modified(msg); 339 // Iterate backward so that the insertion indexes do not change. 340 int32_t count=countParts(); 341 for(int32_t i=count; i>0;) { 342 const Part &part=getPart(--i); 343 if(part.getType()==UMSGPAT_PART_TYPE_INSERT_CHAR) { 344 modified.insert(part.index, (UChar)part.value); 345 } 346 } 347 return modified; 348 } 349 350 double 351 MessagePattern::getNumericValue(const Part &part) const { 352 UMessagePatternPartType type=part.type; 353 if(type==UMSGPAT_PART_TYPE_ARG_INT) { 354 return part.value; 355 } else if(type==UMSGPAT_PART_TYPE_ARG_DOUBLE) { 356 return numericValues[part.value]; 357 } else { 358 return UMSGPAT_NO_NUMERIC_VALUE; 359 } 360 } 361 362 /** 363 * Returns the "offset:" value of a PluralFormat argument, or 0 if none is specified. 364 * @param pluralStart the index of the first PluralFormat argument style part. (0..countParts()-1) 365 * @return the "offset:" value. 366 * @draft ICU 4.8 367 */ 368 double 369 MessagePattern::getPluralOffset(int32_t pluralStart) const { 370 const Part &part=getPart(pluralStart); 371 if(Part::hasNumericValue(part.type)) { 372 return getNumericValue(part); 373 } else { 374 return 0; 375 } 376 } 377 378 // MessagePattern::Part ---------------------------------------------------- *** 379 380 UBool 381 MessagePattern::Part::operator==(const Part &other) const { 382 if(this==&other) { 383 return TRUE; 384 } 385 return 386 type==other.type && 387 index==other.index && 388 length==other.length && 389 value==other.value && 390 limitPartIndex==other.limitPartIndex; 391 } 392 393 // MessagePattern parser --------------------------------------------------- *** 394 395 void 396 MessagePattern::preParse(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode) { 397 if(U_FAILURE(errorCode)) { 398 return; 399 } 400 if(parseError!=NULL) { 401 parseError->line=0; 402 parseError->offset=0; 403 parseError->preContext[0]=0; 404 parseError->postContext[0]=0; 405 } 406 msg=pattern; 407 hasArgNames=hasArgNumbers=FALSE; 408 needsAutoQuoting=FALSE; 409 partsLength=0; 410 numericValuesLength=0; 411 } 412 413 void 414 MessagePattern::postParse() { 415 if(partsList!=NULL) { 416 parts=partsList->a.getAlias(); 417 } 418 if(numericValuesList!=NULL) { 419 numericValues=numericValuesList->a.getAlias(); 420 } 421 } 422 423 int32_t 424 MessagePattern::parseMessage(int32_t index, int32_t msgStartLength, 425 int32_t nestingLevel, UMessagePatternArgType parentType, 426 UParseError *parseError, UErrorCode &errorCode) { 427 if(U_FAILURE(errorCode)) { 428 return 0; 429 } 430 if(nestingLevel>Part::MAX_VALUE) { 431 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 432 return 0; 433 } 434 int32_t msgStart=partsLength; 435 addPart(UMSGPAT_PART_TYPE_MSG_START, index, msgStartLength, nestingLevel, errorCode); 436 index+=msgStartLength; 437 for(;;) { // while(index<msg.length()) with U_FAILURE(errorCode) check 438 if(U_FAILURE(errorCode)) { 439 return 0; 440 } 441 if(index>=msg.length()) { 442 break; 443 } 444 UChar c=msg.charAt(index++); 445 if(c==u_apos) { 446 if(index==msg.length()) { 447 // The apostrophe is the last character in the pattern. 448 // Add a Part for auto-quoting. 449 addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0, 450 u_apos, errorCode); // value=char to be inserted 451 needsAutoQuoting=TRUE; 452 } else { 453 c=msg.charAt(index); 454 if(c==u_apos) { 455 // double apostrophe, skip the second one 456 addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index++, 1, 0, errorCode); 457 } else if( 458 aposMode==UMSGPAT_APOS_DOUBLE_REQUIRED || 459 c==u_leftCurlyBrace || c==u_rightCurlyBrace || 460 (parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_pipe) || 461 (parentType==UMSGPAT_ARG_TYPE_PLURAL && c==u_pound) 462 ) { 463 // skip the quote-starting apostrophe 464 addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index-1, 1, 0, errorCode); 465 // find the end of the quoted literal text 466 for(;;) { 467 index=msg.indexOf(u_apos, index+1); 468 if(index>=0) { 469 if(/*(index+1)<msg.length() &&*/ msg.charAt(index+1)==u_apos) { 470 // double apostrophe inside quoted literal text 471 // still encodes a single apostrophe, skip the second one 472 addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, ++index, 1, 0, errorCode); 473 } else { 474 // skip the quote-ending apostrophe 475 addPart(UMSGPAT_PART_TYPE_SKIP_SYNTAX, index++, 1, 0, errorCode); 476 break; 477 } 478 } else { 479 // The quoted text reaches to the end of the of the message. 480 index=msg.length(); 481 // Add a Part for auto-quoting. 482 addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0, 483 u_apos, errorCode); // value=char to be inserted 484 needsAutoQuoting=TRUE; 485 break; 486 } 487 } 488 } else { 489 // Interpret the apostrophe as literal text. 490 // Add a Part for auto-quoting. 491 addPart(UMSGPAT_PART_TYPE_INSERT_CHAR, index, 0, 492 u_apos, errorCode); // value=char to be inserted 493 needsAutoQuoting=TRUE; 494 } 495 } 496 } else if(parentType==UMSGPAT_ARG_TYPE_PLURAL && c==u_pound) { 497 // The unquoted # in a plural message fragment will be replaced 498 // with the (number-offset). 499 addPart(UMSGPAT_PART_TYPE_REPLACE_NUMBER, index-1, 1, 0, errorCode); 500 } else if(c==u_leftCurlyBrace) { 501 index=parseArg(index-1, 1, nestingLevel, parseError, errorCode); 502 } else if((nestingLevel>0 && c==u_rightCurlyBrace) || 503 (parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_pipe)) { 504 // Finish the message before the terminator. 505 // In a choice style, report the "}" substring only for the following ARG_LIMIT, 506 // not for this MSG_LIMIT. 507 int32_t limitLength=(parentType==UMSGPAT_ARG_TYPE_CHOICE && c==u_rightCurlyBrace) ? 0 : 1; 508 addLimitPart(msgStart, UMSGPAT_PART_TYPE_MSG_LIMIT, index-1, limitLength, 509 nestingLevel, errorCode); 510 if(parentType==UMSGPAT_ARG_TYPE_CHOICE) { 511 // Let the choice style parser see the '}' or '|'. 512 return index-1; 513 } else { 514 // continue parsing after the '}' 515 return index; 516 } 517 } // else: c is part of literal text 518 } 519 if(nestingLevel>0 && !inTopLevelChoiceMessage(nestingLevel, parentType)) { 520 setParseError(parseError, 0); // Unmatched '{' braces in message. 521 errorCode=U_UNMATCHED_BRACES; 522 return 0; 523 } 524 addLimitPart(msgStart, UMSGPAT_PART_TYPE_MSG_LIMIT, index, 0, nestingLevel, errorCode); 525 return index; 526 } 527 528 int32_t 529 MessagePattern::parseArg(int32_t index, int32_t argStartLength, int32_t nestingLevel, 530 UParseError *parseError, UErrorCode &errorCode) { 531 int32_t argStart=partsLength; 532 UMessagePatternArgType argType=UMSGPAT_ARG_TYPE_NONE; 533 addPart(UMSGPAT_PART_TYPE_ARG_START, index, argStartLength, argType, errorCode); 534 if(U_FAILURE(errorCode)) { 535 return 0; 536 } 537 int32_t nameIndex=index=skipWhiteSpace(index+argStartLength); 538 if(index==msg.length()) { 539 setParseError(parseError, 0); // Unmatched '{' braces in message. 540 errorCode=U_UNMATCHED_BRACES; 541 return 0; 542 } 543 // parse argument name or number 544 index=skipIdentifier(index); 545 int32_t number=parseArgNumber(nameIndex, index); 546 if(number>=0) { 547 int32_t length=index-nameIndex; 548 if(length>Part::MAX_LENGTH || number>Part::MAX_VALUE) { 549 setParseError(parseError, nameIndex); // Argument number too large. 550 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 551 return 0; 552 } 553 hasArgNumbers=TRUE; 554 addPart(UMSGPAT_PART_TYPE_ARG_NUMBER, nameIndex, length, number, errorCode); 555 } else if(number==UMSGPAT_ARG_NAME_NOT_NUMBER) { 556 int32_t length=index-nameIndex; 557 if(length>Part::MAX_LENGTH) { 558 setParseError(parseError, nameIndex); // Argument name too long. 559 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 560 return 0; 561 } 562 hasArgNames=TRUE; 563 addPart(UMSGPAT_PART_TYPE_ARG_NAME, nameIndex, length, 0, errorCode); 564 } else { // number<-1 (ARG_NAME_NOT_VALID) 565 setParseError(parseError, nameIndex); // Bad argument syntax. 566 errorCode=U_PATTERN_SYNTAX_ERROR; 567 return 0; 568 } 569 index=skipWhiteSpace(index); 570 if(index==msg.length()) { 571 setParseError(parseError, 0); // Unmatched '{' braces in message. 572 errorCode=U_UNMATCHED_BRACES; 573 return 0; 574 } 575 UChar c=msg.charAt(index); 576 if(c==u_rightCurlyBrace) { 577 // all done 578 } else if(c!=u_comma) { 579 setParseError(parseError, nameIndex); // Bad argument syntax. 580 errorCode=U_PATTERN_SYNTAX_ERROR; 581 return 0; 582 } else /* ',' */ { 583 // parse argument type: case-sensitive a-zA-Z 584 int32_t typeIndex=index=skipWhiteSpace(index+1); 585 while(index<msg.length() && isArgTypeChar(msg.charAt(index))) { 586 ++index; 587 } 588 int32_t length=index-typeIndex; 589 index=skipWhiteSpace(index); 590 if(index==msg.length()) { 591 setParseError(parseError, 0); // Unmatched '{' braces in message. 592 errorCode=U_UNMATCHED_BRACES; 593 return 0; 594 } 595 if(length==0 || ((c=msg.charAt(index))!=u_comma && c!=u_rightCurlyBrace)) { 596 setParseError(parseError, nameIndex); // Bad argument syntax. 597 errorCode=U_PATTERN_SYNTAX_ERROR; 598 return 0; 599 } 600 if(length>Part::MAX_LENGTH) { 601 setParseError(parseError, nameIndex); // Argument type name too long. 602 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 603 return 0; 604 } 605 argType=UMSGPAT_ARG_TYPE_SIMPLE; 606 if(length==6) { 607 // case-insensitive comparisons for complex-type names 608 if(isChoice(typeIndex)) { 609 argType=UMSGPAT_ARG_TYPE_CHOICE; 610 } else if(isPlural(typeIndex)) { 611 argType=UMSGPAT_ARG_TYPE_PLURAL; 612 } else if(isSelect(typeIndex)) { 613 argType=UMSGPAT_ARG_TYPE_SELECT; 614 } 615 } 616 // change the ARG_START type from NONE to argType 617 partsList->a[argStart].value=(int16_t)argType; 618 if(argType==UMSGPAT_ARG_TYPE_SIMPLE) { 619 addPart(UMSGPAT_PART_TYPE_ARG_TYPE, typeIndex, length, 0, errorCode); 620 } 621 // look for an argument style (pattern) 622 if(c==u_rightCurlyBrace) { 623 if(argType!=UMSGPAT_ARG_TYPE_SIMPLE) { 624 setParseError(parseError, nameIndex); // No style field for complex argument. 625 errorCode=U_PATTERN_SYNTAX_ERROR; 626 return 0; 627 } 628 } else /* ',' */ { 629 ++index; 630 if(argType==UMSGPAT_ARG_TYPE_SIMPLE) { 631 index=parseSimpleStyle(index, parseError, errorCode); 632 } else if(argType==UMSGPAT_ARG_TYPE_CHOICE) { 633 index=parseChoiceStyle(index, nestingLevel, parseError, errorCode); 634 } else { 635 index=parsePluralOrSelectStyle(argType, index, nestingLevel, parseError, errorCode); 636 } 637 } 638 } 639 // Argument parsing stopped on the '}'. 640 addLimitPart(argStart, UMSGPAT_PART_TYPE_ARG_LIMIT, index, 1, argType, errorCode); 641 return index+1; 642 } 643 644 int32_t 645 MessagePattern::parseSimpleStyle(int32_t index, UParseError *parseError, UErrorCode &errorCode) { 646 if(U_FAILURE(errorCode)) { 647 return 0; 648 } 649 int32_t start=index; 650 int32_t nestedBraces=0; 651 while(index<msg.length()) { 652 UChar c=msg.charAt(index++); 653 if(c==u_apos) { 654 // Treat apostrophe as quoting but include it in the style part. 655 // Find the end of the quoted literal text. 656 index=msg.indexOf(u_apos, index); 657 if(index<0) { 658 // Quoted literal argument style text reaches to the end of the message. 659 setParseError(parseError, start); 660 errorCode=U_PATTERN_SYNTAX_ERROR; 661 return 0; 662 } 663 // skip the quote-ending apostrophe 664 ++index; 665 } else if(c==u_leftCurlyBrace) { 666 ++nestedBraces; 667 } else if(c==u_rightCurlyBrace) { 668 if(nestedBraces>0) { 669 --nestedBraces; 670 } else { 671 int32_t length=--index-start; 672 if(length>Part::MAX_LENGTH) { 673 setParseError(parseError, start); // Argument style text too long. 674 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 675 return 0; 676 } 677 addPart(UMSGPAT_PART_TYPE_ARG_STYLE, start, length, 0, errorCode); 678 return index; 679 } 680 } // c is part of literal text 681 } 682 setParseError(parseError, 0); // Unmatched '{' braces in message. 683 errorCode=U_UNMATCHED_BRACES; 684 return 0; 685 } 686 687 int32_t 688 MessagePattern::parseChoiceStyle(int32_t index, int32_t nestingLevel, 689 UParseError *parseError, UErrorCode &errorCode) { 690 if(U_FAILURE(errorCode)) { 691 return 0; 692 } 693 int32_t start=index; 694 index=skipWhiteSpace(index); 695 if(index==msg.length() || msg.charAt(index)==u_rightCurlyBrace) { 696 setParseError(parseError, 0); // Missing choice argument pattern. 697 errorCode=U_PATTERN_SYNTAX_ERROR; 698 return 0; 699 } 700 for(;;) { 701 // The choice argument style contains |-separated (number, separator, message) triples. 702 // Parse the number. 703 int32_t numberIndex=index; 704 index=skipDouble(index); 705 int32_t length=index-numberIndex; 706 if(length==0) { 707 setParseError(parseError, start); // Bad choice pattern syntax. 708 errorCode=U_PATTERN_SYNTAX_ERROR; 709 return 0; 710 } 711 if(length>Part::MAX_LENGTH) { 712 setParseError(parseError, numberIndex); // Choice number too long. 713 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 714 return 0; 715 } 716 parseDouble(numberIndex, index, TRUE, parseError, errorCode); // adds ARG_INT or ARG_DOUBLE 717 if(U_FAILURE(errorCode)) { 718 return 0; 719 } 720 // Parse the separator. 721 index=skipWhiteSpace(index); 722 if(index==msg.length()) { 723 setParseError(parseError, start); // Bad choice pattern syntax. 724 errorCode=U_PATTERN_SYNTAX_ERROR; 725 return 0; 726 } 727 UChar c=msg.charAt(index); 728 if(!(c==u_pound || c==u_lessThan || c==u_lessOrEqual)) { // U+2264 is <= 729 setParseError(parseError, start); // Expected choice separator (#<\u2264) instead of c. 730 errorCode=U_PATTERN_SYNTAX_ERROR; 731 return 0; 732 } 733 addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, index, 1, 0, errorCode); 734 // Parse the message fragment. 735 index=parseMessage(++index, 0, nestingLevel+1, UMSGPAT_ARG_TYPE_CHOICE, parseError, errorCode); 736 if(U_FAILURE(errorCode)) { 737 return 0; 738 } 739 // parseMessage(..., CHOICE) returns the index of the terminator, or msg.length(). 740 if(index==msg.length()) { 741 return index; 742 } 743 if(msg.charAt(index)==u_rightCurlyBrace) { 744 if(!inMessageFormatPattern(nestingLevel)) { 745 setParseError(parseError, start); // Bad choice pattern syntax. 746 errorCode=U_PATTERN_SYNTAX_ERROR; 747 return 0; 748 } 749 return index; 750 } // else the terminator is '|' 751 index=skipWhiteSpace(index+1); 752 } 753 } 754 755 int32_t 756 MessagePattern::parsePluralOrSelectStyle(UMessagePatternArgType argType, 757 int32_t index, int32_t nestingLevel, 758 UParseError *parseError, UErrorCode &errorCode) { 759 if(U_FAILURE(errorCode)) { 760 return 0; 761 } 762 int32_t start=index; 763 UBool isEmpty=TRUE; 764 UBool hasOther=FALSE; 765 for(;;) { 766 // First, collect the selector looking for a small set of terminators. 767 // It would be a little faster to consider the syntax of each possible 768 // token right here, but that makes the code too complicated. 769 index=skipWhiteSpace(index); 770 UBool eos=index==msg.length(); 771 if(eos || msg.charAt(index)==u_rightCurlyBrace) { 772 if(eos==inMessageFormatPattern(nestingLevel)) { 773 setParseError(parseError, start); // Bad plural/select pattern syntax. 774 errorCode=U_PATTERN_SYNTAX_ERROR; 775 return 0; 776 } 777 if(!hasOther) { 778 setParseError(parseError, 0); // Missing 'other' keyword in plural/select pattern. 779 errorCode=U_DEFAULT_KEYWORD_MISSING; 780 return 0; 781 } 782 return index; 783 } 784 int32_t selectorIndex=index; 785 if(argType==UMSGPAT_ARG_TYPE_PLURAL && msg.charAt(selectorIndex)==u_equal) { 786 // explicit-value plural selector: =double 787 index=skipDouble(index+1); 788 int32_t length=index-selectorIndex; 789 if(length==1) { 790 setParseError(parseError, start); // Bad plural/select pattern syntax. 791 errorCode=U_PATTERN_SYNTAX_ERROR; 792 return 0; 793 } 794 if(length>Part::MAX_LENGTH) { 795 setParseError(parseError, selectorIndex); // Argument selector too long. 796 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 797 return 0; 798 } 799 addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, selectorIndex, length, 0, errorCode); 800 parseDouble(selectorIndex+1, index, FALSE, 801 parseError, errorCode); // adds ARG_INT or ARG_DOUBLE 802 } else { 803 index=skipIdentifier(index); 804 int32_t length=index-selectorIndex; 805 if(length==0) { 806 setParseError(parseError, start); // Bad plural/select pattern syntax. 807 errorCode=U_PATTERN_SYNTAX_ERROR; 808 return 0; 809 } 810 // Note: The ':' in "offset:" is just beyond the skipIdentifier() range. 811 if( argType==UMSGPAT_ARG_TYPE_PLURAL && length==6 && index<msg.length() && 812 0==msg.compare(selectorIndex, 7, kOffsetColon, 0, 7) 813 ) { 814 // plural offset, not a selector 815 if(!isEmpty) { 816 // Plural argument 'offset:' (if present) must precede key-message pairs. 817 setParseError(parseError, start); 818 errorCode=U_PATTERN_SYNTAX_ERROR; 819 return 0; 820 } 821 // allow whitespace between offset: and its value 822 int32_t valueIndex=skipWhiteSpace(index+1); // The ':' is at index. 823 index=skipDouble(valueIndex); 824 if(index==valueIndex) { 825 setParseError(parseError, start); // Missing value for plural 'offset:'. 826 errorCode=U_PATTERN_SYNTAX_ERROR; 827 return 0; 828 } 829 if((index-valueIndex)>Part::MAX_LENGTH) { 830 setParseError(parseError, valueIndex); // Plural offset value too long. 831 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 832 return 0; 833 } 834 parseDouble(valueIndex, index, FALSE, 835 parseError, errorCode); // adds ARG_INT or ARG_DOUBLE 836 if(U_FAILURE(errorCode)) { 837 return 0; 838 } 839 isEmpty=FALSE; 840 continue; // no message fragment after the offset 841 } else { 842 // normal selector word 843 if(length>Part::MAX_LENGTH) { 844 setParseError(parseError, selectorIndex); // Argument selector too long. 845 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 846 return 0; 847 } 848 addPart(UMSGPAT_PART_TYPE_ARG_SELECTOR, selectorIndex, length, 0, errorCode); 849 if(0==msg.compare(selectorIndex, length, kOther, 0, 5)) { 850 hasOther=TRUE; 851 } 852 } 853 } 854 if(U_FAILURE(errorCode)) { 855 return 0; 856 } 857 858 // parse the message fragment following the selector 859 index=skipWhiteSpace(index); 860 if(index==msg.length() || msg.charAt(index)!=u_leftCurlyBrace) { 861 setParseError(parseError, selectorIndex); // No message fragment after plural/select selector. 862 errorCode=U_PATTERN_SYNTAX_ERROR; 863 return 0; 864 } 865 index=parseMessage(index, 1, nestingLevel+1, argType, parseError, errorCode); 866 if(U_FAILURE(errorCode)) { 867 return 0; 868 } 869 isEmpty=FALSE; 870 } 871 } 872 873 int32_t 874 MessagePattern::parseArgNumber(const UnicodeString &s, int32_t start, int32_t limit) { 875 // If the identifier contains only ASCII digits, then it is an argument _number_ 876 // and must not have leading zeros (except "0" itself). 877 // Otherwise it is an argument _name_. 878 if(start>=limit) { 879 return UMSGPAT_ARG_NAME_NOT_VALID; 880 } 881 int32_t number; 882 // Defer numeric errors until we know there are only digits. 883 UBool badNumber; 884 UChar c=s.charAt(start++); 885 if(c==0x30) { 886 if(start==limit) { 887 return 0; 888 } else { 889 number=0; 890 badNumber=TRUE; // leading zero 891 } 892 } else if(0x31<=c && c<=0x39) { 893 number=c-0x30; 894 badNumber=FALSE; 895 } else { 896 return UMSGPAT_ARG_NAME_NOT_NUMBER; 897 } 898 while(start<limit) { 899 c=s.charAt(start++); 900 if(0x30<=c && c<=0x39) { 901 if(number>=INT32_MAX/10) { 902 badNumber=TRUE; // overflow 903 } 904 number=number*10+(c-0x30); 905 } else { 906 return UMSGPAT_ARG_NAME_NOT_NUMBER; 907 } 908 } 909 // There are only ASCII digits. 910 if(badNumber) { 911 return UMSGPAT_ARG_NAME_NOT_VALID; 912 } else { 913 return number; 914 } 915 } 916 917 void 918 MessagePattern::parseDouble(int32_t start, int32_t limit, UBool allowInfinity, 919 UParseError *parseError, UErrorCode &errorCode) { 920 if(U_FAILURE(errorCode)) { 921 return; 922 } 923 U_ASSERT(start<limit); 924 // fake loop for easy exit and single throw statement 925 for(;;) { 926 // fast path for small integers and infinity 927 int32_t value=0; 928 int32_t isNegative=0; // not boolean so that we can easily add it to value 929 int32_t index=start; 930 UChar c=msg.charAt(index++); 931 if(c==u_minus) { 932 isNegative=1; 933 if(index==limit) { 934 break; // no number 935 } 936 c=msg.charAt(index++); 937 } else if(c==u_plus) { 938 if(index==limit) { 939 break; // no number 940 } 941 c=msg.charAt(index++); 942 } 943 if(c==0x221e) { // infinity 944 if(allowInfinity && index==limit) { 945 double infinity=uprv_getInfinity(); 946 addArgDoublePart( 947 isNegative!=0 ? -infinity : infinity, 948 start, limit-start, errorCode); 949 return; 950 } else { 951 break; 952 } 953 } 954 // try to parse the number as a small integer but fall back to a double 955 while('0'<=c && c<='9') { 956 value=value*10+(c-'0'); 957 if(value>(Part::MAX_VALUE+isNegative)) { 958 break; // not a small-enough integer 959 } 960 if(index==limit) { 961 addPart(UMSGPAT_PART_TYPE_ARG_INT, start, limit-start, 962 isNegative!=0 ? -value : value, errorCode); 963 return; 964 } 965 c=msg.charAt(index++); 966 } 967 // Let Double.parseDouble() throw a NumberFormatException. 968 char numberChars[128]; 969 int32_t capacity=(int32_t)sizeof(numberChars); 970 int32_t length=limit-start; 971 if(length>=capacity) { 972 break; // number too long 973 } 974 msg.extract(start, length, numberChars, capacity, US_INV); 975 if((int32_t)uprv_strlen(numberChars)<length) { 976 break; // contains non-invariant character that was turned into NUL 977 } 978 char *end; 979 double numericValue=uprv_strtod(numberChars, &end); 980 if(end!=(numberChars+length)) { 981 break; // parsing error 982 } 983 addArgDoublePart(numericValue, start, length, errorCode); 984 return; 985 } 986 setParseError(parseError, start /*, limit*/); // Bad syntax for numeric value. 987 errorCode=U_PATTERN_SYNTAX_ERROR; 988 return; 989 } 990 991 int32_t 992 MessagePattern::skipWhiteSpace(int32_t index) { 993 const UChar *s=msg.getBuffer(); 994 int32_t msgLength=msg.length(); 995 const UChar *t=PatternProps::skipWhiteSpace(s+index, msgLength-index); 996 return (int32_t)(t-s); 997 } 998 999 int32_t 1000 MessagePattern::skipIdentifier(int32_t index) { 1001 const UChar *s=msg.getBuffer(); 1002 int32_t msgLength=msg.length(); 1003 const UChar *t=PatternProps::skipIdentifier(s+index, msgLength-index); 1004 return (int32_t)(t-s); 1005 } 1006 1007 int32_t 1008 MessagePattern::skipDouble(int32_t index) { 1009 int32_t msgLength=msg.length(); 1010 while(index<msgLength) { 1011 UChar c=msg.charAt(index); 1012 // U+221E: Allow the infinity symbol, for ChoiceFormat patterns. 1013 if((c<0x30 && c!=u_plus && c!=u_minus && c!=u_dot) || (c>0x39 && c!=u_e && c!=u_E && c!=0x221e)) { 1014 break; 1015 } 1016 ++index; 1017 } 1018 return index; 1019 } 1020 1021 UBool 1022 MessagePattern::isArgTypeChar(UChar32 c) { 1023 return (u_a<=c && c<=u_z) || (u_A<=c && c<=u_Z); 1024 } 1025 1026 UBool 1027 MessagePattern::isChoice(int32_t index) { 1028 UChar c; 1029 return 1030 ((c=msg.charAt(index++))==u_c || c==u_C) && 1031 ((c=msg.charAt(index++))==u_h || c==u_H) && 1032 ((c=msg.charAt(index++))==u_o || c==u_O) && 1033 ((c=msg.charAt(index++))==u_i || c==u_I) && 1034 ((c=msg.charAt(index++))==u_c || c==u_C) && 1035 ((c=msg.charAt(index))==u_e || c==u_E); 1036 } 1037 1038 UBool 1039 MessagePattern::isPlural(int32_t index) { 1040 UChar c; 1041 return 1042 ((c=msg.charAt(index++))==u_p || c==u_P) && 1043 ((c=msg.charAt(index++))==u_l || c==u_L) && 1044 ((c=msg.charAt(index++))==u_u || c==u_U) && 1045 ((c=msg.charAt(index++))==u_r || c==u_R) && 1046 ((c=msg.charAt(index++))==u_a || c==u_A) && 1047 ((c=msg.charAt(index))==u_l || c==u_L); 1048 } 1049 1050 UBool 1051 MessagePattern::isSelect(int32_t index) { 1052 UChar c; 1053 return 1054 ((c=msg.charAt(index++))==u_s || c==u_S) && 1055 ((c=msg.charAt(index++))==u_e || c==u_E) && 1056 ((c=msg.charAt(index++))==u_l || c==u_L) && 1057 ((c=msg.charAt(index++))==u_e || c==u_E) && 1058 ((c=msg.charAt(index++))==u_c || c==u_C) && 1059 ((c=msg.charAt(index))==u_t || c==u_T); 1060 } 1061 1062 UBool 1063 MessagePattern::inMessageFormatPattern(int32_t nestingLevel) { 1064 return nestingLevel>0 || partsList->a[0].type==UMSGPAT_PART_TYPE_MSG_START; 1065 } 1066 1067 UBool 1068 MessagePattern::inTopLevelChoiceMessage(int32_t nestingLevel, UMessagePatternArgType parentType) { 1069 return 1070 nestingLevel==1 && 1071 parentType==UMSGPAT_ARG_TYPE_CHOICE && 1072 partsList->a[0].type!=UMSGPAT_PART_TYPE_MSG_START; 1073 } 1074 1075 void 1076 MessagePattern::addPart(UMessagePatternPartType type, int32_t index, int32_t length, 1077 int32_t value, UErrorCode &errorCode) { 1078 if(partsList->ensureCapacityForOneMore(partsLength, errorCode)) { 1079 Part &part=partsList->a[partsLength++]; 1080 part.type=type; 1081 part.index=index; 1082 part.length=(uint16_t)length; 1083 part.value=(int16_t)value; 1084 part.limitPartIndex=0; 1085 } 1086 } 1087 1088 void 1089 MessagePattern::addLimitPart(int32_t start, 1090 UMessagePatternPartType type, int32_t index, int32_t length, 1091 int32_t value, UErrorCode &errorCode) { 1092 partsList->a[start].limitPartIndex=partsLength; 1093 addPart(type, index, length, value, errorCode); 1094 } 1095 1096 void 1097 MessagePattern::addArgDoublePart(double numericValue, int32_t start, int32_t length, 1098 UErrorCode &errorCode) { 1099 if(U_FAILURE(errorCode)) { 1100 return; 1101 } 1102 int32_t numericIndex=numericValuesLength; 1103 if(numericValuesList==NULL) { 1104 numericValuesList=new MessagePatternDoubleList(); 1105 if(numericValuesList==NULL) { 1106 errorCode=U_MEMORY_ALLOCATION_ERROR; 1107 return; 1108 } 1109 } else if(!numericValuesList->ensureCapacityForOneMore(numericValuesLength, errorCode)) { 1110 return; 1111 } else { 1112 if(numericIndex>Part::MAX_VALUE) { 1113 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1114 return; 1115 } 1116 } 1117 numericValuesList->a[numericValuesLength++]=numericValue; 1118 addPart(UMSGPAT_PART_TYPE_ARG_DOUBLE, start, length, numericIndex, errorCode); 1119 } 1120 1121 void 1122 MessagePattern::setParseError(UParseError *parseError, int32_t index) { 1123 if(parseError==NULL) { 1124 return; 1125 } 1126 parseError->offset=index; 1127 1128 // Set preContext to some of msg before index. 1129 // Avoid splitting a surrogate pair. 1130 int32_t length=index; 1131 if(length>=U_PARSE_CONTEXT_LEN) { 1132 length=U_PARSE_CONTEXT_LEN-1; 1133 if(length>0 && U16_IS_TRAIL(msg[index-length])) { 1134 --length; 1135 } 1136 } 1137 msg.extract(index-length, length, parseError->preContext); 1138 parseError->preContext[length]=0; 1139 1140 // Set postContext to some of msg starting at index. 1141 length=msg.length()-index; 1142 if(length>=U_PARSE_CONTEXT_LEN) { 1143 length=U_PARSE_CONTEXT_LEN-1; 1144 if(length>0 && U16_IS_LEAD(msg[index+length-1])) { 1145 --length; 1146 } 1147 } 1148 msg.extract(index, length, parseError->postContext); 1149 parseError->postContext[length]=0; 1150 } 1151 1152 UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(MessagePattern) 1153 1154 // MessageImpl ------------------------------------------------------------- *** 1155 1156 void 1157 MessageImpl::appendReducedApostrophes(const UnicodeString &s, int32_t start, int32_t limit, 1158 UnicodeString &sb) { 1159 int32_t doubleApos=-1; 1160 for(;;) { 1161 int32_t i=s.indexOf(u_apos, start); 1162 if(i<0 || i>=limit) { 1163 sb.append(s, start, limit-start); 1164 break; 1165 } 1166 if(i==doubleApos) { 1167 // Double apostrophe at start-1 and start==i, append one. 1168 sb.append(u_apos); 1169 ++start; 1170 doubleApos=-1; 1171 } else { 1172 // Append text between apostrophes and skip this one. 1173 sb.append(s, start, i-start); 1174 doubleApos=start=i+1; 1175 } 1176 } 1177 } 1178 1179 // Ported from second half of ICU4J SelectFormat.format(String). 1180 UnicodeString & 1181 MessageImpl::appendSubMessageWithoutSkipSyntax(const MessagePattern &msgPattern, 1182 int32_t msgStart, 1183 UnicodeString &result) { 1184 const UnicodeString &msgString=msgPattern.getPatternString(); 1185 int32_t prevIndex=msgPattern.getPart(msgStart).getLimit(); 1186 for(int32_t i=msgStart;;) { 1187 const MessagePattern::Part &part=msgPattern.getPart(++i); 1188 UMessagePatternPartType type=part.getType(); 1189 int32_t index=part.getIndex(); 1190 if(type==UMSGPAT_PART_TYPE_MSG_LIMIT) { 1191 return result.append(msgString, prevIndex, index-prevIndex); 1192 } else if(type==UMSGPAT_PART_TYPE_SKIP_SYNTAX) { 1193 result.append(msgString, prevIndex, index-prevIndex); 1194 prevIndex=part.getLimit(); 1195 } else if(type==UMSGPAT_PART_TYPE_ARG_START) { 1196 result.append(msgString, prevIndex, index-prevIndex); 1197 prevIndex=index; 1198 i=msgPattern.getLimitPartIndex(i); 1199 index=msgPattern.getPart(i).getLimit(); 1200 appendReducedApostrophes(msgString, prevIndex, index, result); 1201 prevIndex=index; 1202 } 1203 } 1204 } 1205 1206 U_NAMESPACE_END 1207 1208 #endif // !UCONFIG_NO_FORMATTING 1209