1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package android.icu.impl; 12 13 import java.io.IOException; 14 import java.nio.ByteBuffer; 15 import java.util.Iterator; 16 import java.util.MissingResourceException; 17 18 import android.icu.lang.UCharacter; 19 import android.icu.lang.UCharacter.HangulSyllableType; 20 import android.icu.lang.UCharacter.NumericType; 21 import android.icu.lang.UCharacterCategory; 22 import android.icu.lang.UProperty; 23 import android.icu.lang.UScript; 24 import android.icu.text.Normalizer2; 25 import android.icu.text.UTF16; 26 import android.icu.text.UnicodeSet; 27 import android.icu.util.ICUException; 28 import android.icu.util.VersionInfo; 29 30 /** 31 * <p>Internal class used for Unicode character property database.</p> 32 * <p>This classes store binary data read from uprops.icu. 33 * It does not have the capability to parse the data into more high-level 34 * information. 
It only returns bytes of information when required.</p> 35 * <p>Due to the form most commonly used for retrieval, array of char is used 36 * to store the binary data.</p> 37 * <p>UCharacterPropertyDB also contains information on accessing indexes to 38 * significant points in the binary data.</p> 39 * <p>Responsibility for molding the binary data into more meaning form lies on 40 * <a href=UCharacter.html>UCharacter</a>.</p> 41 * @author Syn Wee Quek 42 * @hide Only a subset of ICU is exposed in Android 43 */ 44 45 public final class UCharacterProperty 46 { 47 // public data members ----------------------------------------------- 48 49 /* 50 * public singleton instance 51 */ 52 public static final UCharacterProperty INSTANCE; 53 54 /** 55 * Trie data 56 */ 57 public Trie2_16 m_trie_; 58 /** 59 * Unicode version 60 */ 61 public VersionInfo m_unicodeVersion_; 62 /** 63 * Latin capital letter i with dot above 64 */ 65 public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130; 66 /** 67 * Latin small letter i with dot above 68 */ 69 public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131; 70 /** 71 * Latin lowercase i 72 */ 73 public static final char LATIN_SMALL_LETTER_I_ = 0x69; 74 /** 75 * Character type mask 76 */ 77 public static final int TYPE_MASK = 0x1F; 78 79 // uprops.h enum UPropertySource --------------------------------------- *** 80 81 /** No source, not a supported property. 
*/ 82 public static final int SRC_NONE=0; 83 /** From uchar.c/uprops.icu main trie */ 84 public static final int SRC_CHAR=1; 85 /** From uchar.c/uprops.icu properties vectors trie */ 86 public static final int SRC_PROPSVEC=2; 87 /** From unames.c/unames.icu */ 88 public static final int SRC_NAMES=3; 89 /** From ucase.c/ucase.icu */ 90 public static final int SRC_CASE=4; 91 /** From ubidi_props.c/ubidi.icu */ 92 public static final int SRC_BIDI=5; 93 /** From uchar.c/uprops.icu main trie as well as properties vectors trie */ 94 public static final int SRC_CHAR_AND_PROPSVEC=6; 95 /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */ 96 public static final int SRC_CASE_AND_NORM=7; 97 /** From normalizer2impl.cpp/nfc.nrm */ 98 public static final int SRC_NFC=8; 99 /** From normalizer2impl.cpp/nfkc.nrm */ 100 public static final int SRC_NFKC=9; 101 /** From normalizer2impl.cpp/nfkc_cf.nrm */ 102 public static final int SRC_NFKC_CF=10; 103 /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */ 104 public static final int SRC_NFC_CANON_ITER=11; 105 /** One more than the highest UPropertySource (SRC_) constant. */ 106 public static final int SRC_COUNT=12; 107 108 // public methods ---------------------------------------------------- 109 110 /** 111 * Gets the main property value for code point ch. 112 * @param ch code point whose property value is to be retrieved 113 * @return property value of code point 114 */ 115 public final int getProperty(int ch) 116 { 117 return m_trie_.get(ch); 118 } 119 120 /** 121 * Gets the unicode additional properties. 122 * Java version of C u_getUnicodeProperties(). 123 * @param codepoint codepoint whose additional properties is to be 124 * retrieved 125 * @param column The column index. 
126 * @return unicode properties 127 */ 128 public int getAdditional(int codepoint, int column) { 129 assert column >= 0; 130 if (column >= m_additionalColumnsCount_) { 131 return 0; 132 } 133 return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; 134 } 135 136 static final int MY_MASK = UCharacterProperty.TYPE_MASK 137 & ((1<<UCharacterCategory.UPPERCASE_LETTER) | 138 (1<<UCharacterCategory.LOWERCASE_LETTER) | 139 (1<<UCharacterCategory.TITLECASE_LETTER) | 140 (1<<UCharacterCategory.MODIFIER_LETTER) | 141 (1<<UCharacterCategory.OTHER_LETTER)); 142 143 144 /** 145 * <p>Get the "age" of the code point.</p> 146 * <p>The "age" is the Unicode version when the code point was first 147 * designated (as a non-character or for Private Use) or assigned a 148 * character.</p> 149 * <p>This can be useful to avoid emitting code points to receiving 150 * processes that do not accept newer characters.</p> 151 * <p>The data is from the UCD file DerivedAge.txt.</p> 152 * <p>This API does not check the validity of the codepoint.</p> 153 * @param codepoint The code point. 154 * @return the Unicode version number 155 */ 156 public VersionInfo getAge(int codepoint) 157 { 158 int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; 159 return VersionInfo.getInstance( 160 (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, 161 version & LAST_NIBBLE_MASK_, 0, 0); 162 } 163 164 private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED); 165 private static final int GC_CC_MASK = getMask(UCharacter.CONTROL); 166 private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE); 167 private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR); 168 private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR); 169 private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR); 170 /** Mask constant for multiple UCharCategory bits (Z Separators). 
*/ 171 private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK; 172 173 /** 174 * Checks if c is in 175 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] 176 * with space=\p{Whitespace} and Control=Cc. 177 * Implements UCHAR_POSIX_GRAPH. 178 * @hide draft / provisional / internal are hidden on Android 179 */ 180 private static final boolean isgraphPOSIX(int c) { 181 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ 182 /* comparing ==0 returns FALSE for the categories mentioned */ 183 return (getMask(UCharacter.getType(c))& 184 (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK)) 185 ==0; 186 } 187 188 // binary properties --------------------------------------------------- *** 189 190 private class BinaryProperty { 191 int column; // SRC_PROPSVEC column, or "source" if mask==0 192 int mask; 193 BinaryProperty(int column, int mask) { 194 this.column=column; 195 this.mask=mask; 196 } 197 BinaryProperty(int source) { 198 this.column=source; 199 this.mask=0; 200 } 201 final int getSource() { 202 return mask==0 ? 
column : SRC_PROPSVEC; 203 } 204 boolean contains(int c) { 205 // systematic, directly stored properties 206 return (getAdditional(c, column)&mask)!=0; 207 } 208 } 209 210 private class CaseBinaryProperty extends BinaryProperty { // case mapping properties 211 int which; 212 CaseBinaryProperty(int which) { 213 super(SRC_CASE); 214 this.which=which; 215 } 216 @Override 217 boolean contains(int c) { 218 return UCaseProps.INSTANCE.hasBinaryProperty(c, which); 219 } 220 } 221 222 private class NormInertBinaryProperty extends BinaryProperty { // UCHAR_NF*_INERT properties 223 int which; 224 NormInertBinaryProperty(int source, int which) { 225 super(source); 226 this.which=which; 227 } 228 @Override 229 boolean contains(int c) { 230 return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c); 231 } 232 } 233 234 BinaryProperty[] binProps={ 235 /* 236 * Binary-property implementations must be in order of corresponding UProperty, 237 * and there must be exactly one entry per binary UProperty. 238 */ 239 new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)), 240 new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)), 241 new BinaryProperty(SRC_BIDI) { // UCHAR_BIDI_CONTROL 242 @Override 243 boolean contains(int c) { 244 return UBiDiProps.INSTANCE.isBidiControl(c); 245 } 246 }, 247 new BinaryProperty(SRC_BIDI) { // UCHAR_BIDI_MIRRORED 248 @Override 249 boolean contains(int c) { 250 return UBiDiProps.INSTANCE.isMirrored(c); 251 } 252 }, 253 new BinaryProperty(1, (1<<DASH_PROPERTY_)), 254 new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)), 255 new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)), 256 new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)), 257 new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)), 258 new BinaryProperty(SRC_NFC) { // UCHAR_FULL_COMPOSITION_EXCLUSION 259 @Override 260 boolean contains(int c) { 261 // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. 
262 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl; 263 return impl.isCompNo(impl.getNorm16(c)); 264 } 265 }, 266 new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)), 267 new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)), 268 new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)), 269 new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)), 270 new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)), 271 new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)), 272 new BinaryProperty(1, (1<<ID_START_PROPERTY_)), 273 new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)), 274 new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)), 275 new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)), 276 new BinaryProperty(SRC_BIDI) { // UCHAR_JOIN_CONTROL 277 @Override 278 boolean contains(int c) { 279 return UBiDiProps.INSTANCE.isJoinControl(c); 280 } 281 }, 282 new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)), 283 new CaseBinaryProperty(UProperty.LOWERCASE), 284 new BinaryProperty(1, (1<<MATH_PROPERTY_)), 285 new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)), 286 new BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)), 287 new BinaryProperty(1, (1<<RADICAL_PROPERTY_)), 288 new CaseBinaryProperty(UProperty.SOFT_DOTTED), 289 new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)), 290 new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)), 291 new CaseBinaryProperty(UProperty.UPPERCASE), 292 new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)), 293 new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)), 294 new BinaryProperty(1, (1<<XID_START_PROPERTY_)), 295 new CaseBinaryProperty(UProperty.CASE_SENSITIVE), 296 new BinaryProperty(1, (1<<S_TERM_PROPERTY_)), 297 new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)), 298 new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT), 299 new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT), 300 new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT), 301 new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT), 
302 new BinaryProperty(SRC_NFC_CANON_ITER) { // UCHAR_SEGMENT_STARTER 303 @Override 304 boolean contains(int c) { 305 return Norm2AllModes.getNFCInstance().impl. 306 ensureCanonIterData().isCanonSegmentStarter(c); 307 } 308 }, 309 new BinaryProperty(1, (1<<PATTERN_SYNTAX)), 310 new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)), 311 new BinaryProperty(SRC_CHAR_AND_PROPSVEC) { // UCHAR_POSIX_ALNUM 312 @Override 313 boolean contains(int c) { 314 return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c); 315 } 316 }, 317 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_BLANK 318 @Override 319 boolean contains(int c) { 320 // "horizontal space" 321 if(c<=0x9f) { 322 return c==9 || c==0x20; /* TAB or SPACE */ 323 } else { 324 /* Zs */ 325 return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR; 326 } 327 } 328 }, 329 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_GRAPH 330 @Override 331 boolean contains(int c) { 332 return isgraphPOSIX(c); 333 } 334 }, 335 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_PRINT 336 @Override 337 boolean contains(int c) { 338 /* 339 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}. 340 * 341 * The only cntrl character in graph+blank is TAB (in blank). 342 * Here we implement (blank-TAB)=Zs instead of calling u_isblank(). 
343 */ 344 return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c); 345 } 346 }, 347 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_XDIGIT 348 @Override 349 boolean contains(int c) { 350 /* check ASCII and Fullwidth ASCII a-fA-F */ 351 if( 352 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) || 353 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41)) 354 ) { 355 return true; 356 } 357 return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER; 358 } 359 }, 360 new CaseBinaryProperty(UProperty.CASED), 361 new CaseBinaryProperty(UProperty.CASE_IGNORABLE), 362 new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED), 363 new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED), 364 new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED), 365 new BinaryProperty(SRC_CASE_AND_NORM) { // UCHAR_CHANGES_WHEN_CASEFOLDED 366 @Override 367 boolean contains(int c) { 368 String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c); 369 if(nfd!=null) { 370 /* c has a decomposition */ 371 c=nfd.codePointAt(0); 372 if(Character.charCount(c)!=nfd.length()) { 373 /* multiple code points */ 374 c=-1; 375 } 376 } else if(c<0) { 377 return false; /* protect against bad input */ 378 } 379 if(c>=0) { 380 /* single code point */ 381 UCaseProps csp=UCaseProps.INSTANCE; 382 UCaseProps.dummyStringBuilder.setLength(0); 383 return csp.toFullFolding(c, UCaseProps.dummyStringBuilder, 384 UCharacter.FOLD_CASE_DEFAULT)>=0; 385 } else { 386 String folded=UCharacter.foldCase(nfd, true); 387 return !folded.equals(nfd); 388 } 389 } 390 }, 391 new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED), 392 new BinaryProperty(SRC_NFKC_CF) { // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED 393 @Override 394 boolean contains(int c) { 395 Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl; 396 String src=UTF16.valueOf(c); 397 StringBuilder dest=new StringBuilder(); 398 // Small destCapacity for NFKC_CF(c). 
399 Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5); 400 kcf.compose(src, 0, src.length(), false, true, buffer); 401 return !Normalizer2Impl.UTF16Plus.equal(dest, src); 402 } 403 }, 404 new BinaryProperty(2, 1<<PROPS_2_EMOJI), 405 new BinaryProperty(2, 1<<PROPS_2_EMOJI_PRESENTATION), 406 new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER), 407 new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER_BASE), 408 }; 409 410 public boolean hasBinaryProperty(int c, int which) { 411 if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) { 412 // not a known binary property 413 return false; 414 } else { 415 return binProps[which].contains(c); 416 } 417 } 418 419 // int-value and enumerated properties --------------------------------- *** 420 421 public int getType(int c) { 422 return getProperty(c)&TYPE_MASK; 423 } 424 425 /* 426 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. 427 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. 428 */ 429 private static final int /* UHangulSyllableType */ gcbToHst[]={ 430 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ 431 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ 432 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ 433 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ 434 HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ 435 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ 436 HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ 437 HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ 438 HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ 439 HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ 440 /* 441 * Omit GCB values beyond what we need for hst. 442 * The code below checks for the array length. 
443 */ 444 }; 445 446 private class IntProperty { 447 int column; // SRC_PROPSVEC column, or "source" if mask==0 448 int mask; 449 int shift; 450 IntProperty(int column, int mask, int shift) { 451 this.column=column; 452 this.mask=mask; 453 this.shift=shift; 454 } 455 IntProperty(int source) { 456 this.column=source; 457 this.mask=0; 458 } 459 final int getSource() { 460 return mask==0 ? column : SRC_PROPSVEC; 461 } 462 int getValue(int c) { 463 // systematic, directly stored properties 464 return (getAdditional(c, column)&mask)>>>shift; 465 } 466 int getMaxValue(int which) { 467 return (getMaxValues(column)&mask)>>>shift; 468 } 469 } 470 471 private class BiDiIntProperty extends IntProperty { 472 BiDiIntProperty() { 473 super(SRC_BIDI); 474 } 475 @Override 476 int getMaxValue(int which) { 477 return UBiDiProps.INSTANCE.getMaxValue(which); 478 } 479 } 480 481 private class CombiningClassIntProperty extends IntProperty { 482 CombiningClassIntProperty(int source) { 483 super(source); 484 } 485 @Override 486 int getMaxValue(int which) { 487 return 0xff; 488 } 489 } 490 491 private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties 492 int which; 493 int max; 494 NormQuickCheckIntProperty(int source, int which, int max) { 495 super(source); 496 this.which=which; 497 this.max=max; 498 } 499 @Override 500 int getValue(int c) { 501 return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c); 502 } 503 @Override 504 int getMaxValue(int which) { 505 return max; 506 } 507 } 508 509 IntProperty intProps[]={ 510 new BiDiIntProperty() { // BIDI_CLASS 511 @Override 512 int getValue(int c) { 513 return UBiDiProps.INSTANCE.getClass(c); 514 } 515 }, 516 new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_), 517 new CombiningClassIntProperty(SRC_NFC) { // CANONICAL_COMBINING_CLASS 518 @Override 519 int getValue(int c) { 520 return Normalizer2.getNFDInstance().getCombiningClass(c); 521 } 522 }, 523 new IntProperty(2, 
DECOMPOSITION_TYPE_MASK_, 0), 524 new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_), 525 new IntProperty(SRC_CHAR) { // GENERAL_CATEGORY 526 @Override 527 int getValue(int c) { 528 return getType(c); 529 } 530 @Override 531 int getMaxValue(int which) { 532 return UCharacterCategory.CHAR_CATEGORY_COUNT-1; 533 } 534 }, 535 new BiDiIntProperty() { // JOINING_GROUP 536 @Override 537 int getValue(int c) { 538 return UBiDiProps.INSTANCE.getJoiningGroup(c); 539 } 540 }, 541 new BiDiIntProperty() { // JOINING_TYPE 542 @Override 543 int getValue(int c) { 544 return UBiDiProps.INSTANCE.getJoiningType(c); 545 } 546 }, 547 new IntProperty(2, LB_MASK, LB_SHIFT), // LINE_BREAK 548 new IntProperty(SRC_CHAR) { // NUMERIC_TYPE 549 @Override 550 int getValue(int c) { 551 return ntvGetType(getNumericTypeValue(getProperty(c))); 552 } 553 @Override 554 int getMaxValue(int which) { 555 return NumericType.COUNT-1; 556 } 557 }, 558 new IntProperty(0, SCRIPT_MASK_, 0) { 559 @Override 560 int getValue(int c) { 561 return UScript.getScript(c); 562 } 563 }, 564 new IntProperty(SRC_PROPSVEC) { // HANGUL_SYLLABLE_TYPE 565 @Override 566 int getValue(int c) { 567 /* see comments on gcbToHst[] above */ 568 int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT; 569 if(gcb<gcbToHst.length) { 570 return gcbToHst[gcb]; 571 } else { 572 return HangulSyllableType.NOT_APPLICABLE; 573 } 574 } 575 @Override 576 int getMaxValue(int which) { 577 return HangulSyllableType.COUNT-1; 578 } 579 }, 580 // max=1=YES -- these are never "maybe", only "no" or "yes" 581 new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1), 582 new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1), 583 // max=2=MAYBE 584 new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2), 585 new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2), 586 new CombiningClassIntProperty(SRC_NFC) { // LEAD_CANONICAL_COMBINING_CLASS 587 @Override 588 int getValue(int c) { 589 return 
Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8; 590 } 591 }, 592 new CombiningClassIntProperty(SRC_NFC) { // TRAIL_CANONICAL_COMBINING_CLASS 593 @Override 594 int getValue(int c) { 595 return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff; 596 } 597 }, 598 new IntProperty(2, GCB_MASK, GCB_SHIFT), // GRAPHEME_CLUSTER_BREAK 599 new IntProperty(2, SB_MASK, SB_SHIFT), // SENTENCE_BREAK 600 new IntProperty(2, WB_MASK, WB_SHIFT), // WORD_BREAK 601 new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE 602 @Override 603 int getValue(int c) { 604 return UBiDiProps.INSTANCE.getPairedBracketType(c); 605 } 606 }, 607 }; 608 609 public int getIntPropertyValue(int c, int which) { 610 if(which<UProperty.INT_START) { 611 if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) { 612 return binProps[which].contains(c) ? 1 : 0; 613 } 614 } else if(which<UProperty.INT_LIMIT) { 615 return intProps[which-UProperty.INT_START].getValue(c); 616 } else if (which == UProperty.GENERAL_CATEGORY_MASK) { 617 return getMask(getType(c)); 618 } 619 return 0; // undefined 620 } 621 622 public int getIntPropertyMaxValue(int which) { 623 if(which<UProperty.INT_START) { 624 if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) { 625 return 1; // maximum TRUE for all binary properties 626 } 627 } else if(which<UProperty.INT_LIMIT) { 628 return intProps[which-UProperty.INT_START].getMaxValue(which); 629 } 630 return -1; // undefined 631 } 632 633 public final int getSource(int which) { 634 if(which<UProperty.BINARY_START) { 635 return SRC_NONE; /* undefined */ 636 } else if(which<UProperty.BINARY_LIMIT) { 637 return binProps[which].getSource(); 638 } else if(which<UProperty.INT_START) { 639 return SRC_NONE; /* undefined */ 640 } else if(which<UProperty.INT_LIMIT) { 641 return intProps[which-UProperty.INT_START].getSource(); 642 } else if(which<UProperty.STRING_START) { 643 switch(which) { 644 case UProperty.GENERAL_CATEGORY_MASK: 645 case UProperty.NUMERIC_VALUE: 646 return 
SRC_CHAR; 647 648 default: 649 return SRC_NONE; 650 } 651 } else if(which<UProperty.STRING_LIMIT) { 652 switch(which) { 653 case UProperty.AGE: 654 return SRC_PROPSVEC; 655 656 case UProperty.BIDI_MIRRORING_GLYPH: 657 return SRC_BIDI; 658 659 case UProperty.CASE_FOLDING: 660 case UProperty.LOWERCASE_MAPPING: 661 case UProperty.SIMPLE_CASE_FOLDING: 662 case UProperty.SIMPLE_LOWERCASE_MAPPING: 663 case UProperty.SIMPLE_TITLECASE_MAPPING: 664 case UProperty.SIMPLE_UPPERCASE_MAPPING: 665 case UProperty.TITLECASE_MAPPING: 666 case UProperty.UPPERCASE_MAPPING: 667 return SRC_CASE; 668 669 case UProperty.ISO_COMMENT: 670 case UProperty.NAME: 671 case UProperty.UNICODE_1_NAME: 672 return SRC_NAMES; 673 674 default: 675 return SRC_NONE; 676 } 677 } else { 678 switch(which) { 679 case UProperty.SCRIPT_EXTENSIONS: 680 return SRC_PROPSVEC; 681 default: 682 return SRC_NONE; /* undefined */ 683 } 684 } 685 } 686 687 /** 688 * <p> 689 * Unicode property names and property value names are compared 690 * "loosely". Property[Value]Aliases.txt say: 691 * <quote> 692 * "With loose matching of property names, the case distinctions, 693 * whitespace, and '_' are ignored." 694 * </quote> 695 * </p> 696 * <p> 697 * This function does just that, for ASCII (char *) name strings. 698 * It is almost identical to ucnv_compareNames() but also ignores 699 * ASCII White_Space characters (U+0009..U+000d). 700 * </p> 701 * @param name1 name to compare 702 * @param name2 name to compare 703 * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0 704 * if name1 is greater than name2. 
705 */ 706 /* to be implemented in 2.4 707 * public static int comparePropertyNames(String name1, String name2) 708 { 709 int result = 0; 710 int i1 = 0; 711 int i2 = 0; 712 while (true) { 713 char ch1 = 0; 714 char ch2 = 0; 715 // Ignore delimiters '-', '_', and ASCII White_Space 716 if (i1 < name1.length()) { 717 ch1 = name1.charAt(i1 ++); 718 } 719 while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t' 720 || ch1 == '\n' // synwee what is || ch1 == '\v' 721 || ch1 == '\f' || ch1=='\r') { 722 if (i1 < name1.length()) { 723 ch1 = name1.charAt(i1 ++); 724 } 725 else { 726 ch1 = 0; 727 } 728 } 729 if (i2 < name2.length()) { 730 ch2 = name2.charAt(i2 ++); 731 } 732 while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t' 733 || ch2 == '\n' // synwee what is || ch1 == '\v' 734 || ch2 == '\f' || ch2=='\r') { 735 if (i2 < name2.length()) { 736 ch2 = name2.charAt(i2 ++); 737 } 738 else { 739 ch2 = 0; 740 } 741 } 742 743 // If we reach the ends of both strings then they match 744 if (ch1 == 0 && ch2 == 0) { 745 return 0; 746 } 747 748 // Case-insensitive comparison 749 if (ch1 != ch2) { 750 result = Character.toLowerCase(ch1) 751 - Character.toLowerCase(ch2); 752 if (result != 0) { 753 return result; 754 } 755 } 756 } 757 } 758 */ 759 760 /** 761 * Get the the maximum values for some enum/int properties. 762 * @return maximum values for the integer properties. 763 */ 764 public int getMaxValues(int column) 765 { 766 // return m_maxBlockScriptValue_; 767 768 switch(column) { 769 case 0: 770 return m_maxBlockScriptValue_; 771 case 2: 772 return m_maxJTGValue_; 773 default: 774 return 0; 775 } 776 } 777 778 /** 779 * Gets the type mask 780 * @param type character type 781 * @return mask 782 */ 783 public static final int getMask(int type) 784 { 785 return 1 << type; 786 } 787 788 789 /** 790 * Returns the digit values of characters like 'A' - 'Z', normal, 791 * half-width and full-width. 
This method assumes that the other digit 792 * characters are checked by the calling method. 793 * @param ch character to test 794 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise 795 * its corresponding digit will be returned. 796 */ 797 public static int getEuropeanDigit(int ch) { 798 if ((ch > 0x7a && ch < 0xff21) 799 || ch < 0x41 || (ch > 0x5a && ch < 0x61) 800 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { 801 return -1; 802 } 803 if (ch <= 0x7a) { 804 // ch >= 0x41 or ch < 0x61 805 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); 806 } 807 // ch >= 0xff21 808 if (ch <= 0xff3a) { 809 return ch + 10 - 0xff21; 810 } 811 // ch >= 0xff41 && ch <= 0xff5a 812 return ch + 10 - 0xff41; 813 } 814 815 public int digit(int c) { 816 int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; 817 if(value<=9) { 818 return value; 819 } else { 820 return -1; 821 } 822 } 823 824 public int getNumericValue(int c) { 825 // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit() 826 int ntv = getNumericTypeValue(getProperty(c)); 827 828 if(ntv==NTV_NONE_) { 829 return getEuropeanDigit(c); 830 } else if(ntv<NTV_DIGIT_START_) { 831 /* decimal digit */ 832 return ntv-NTV_DECIMAL_START_; 833 } else if(ntv<NTV_NUMERIC_START_) { 834 /* other digit */ 835 return ntv-NTV_DIGIT_START_; 836 } else if(ntv<NTV_FRACTION_START_) { 837 /* small integer */ 838 return ntv-NTV_NUMERIC_START_; 839 } else if(ntv<NTV_LARGE_START_) { 840 /* fraction */ 841 return -2; 842 } else if(ntv<NTV_BASE60_START_) { 843 /* large, single-significant-digit integer */ 844 int mant=(ntv>>5)-14; 845 int exp=(ntv&0x1f)+2; 846 if(exp<9 || (exp==9 && mant<=2)) { 847 int numValue=mant; 848 do { 849 numValue*=10; 850 } while(--exp>0); 851 return numValue; 852 } else { 853 return -2; 854 } 855 } else if(ntv<NTV_FRACTION20_START_) { 856 /* sexagesimal (base 60) integer */ 857 int numValue=(ntv>>2)-0xbf; 858 int exp=(ntv&3)+1; 859 860 switch(exp) { 861 case 4: 862 
numValue*=60*60*60*60; 863 break; 864 case 3: 865 numValue*=60*60*60; 866 break; 867 case 2: 868 numValue*=60*60; 869 break; 870 case 1: 871 numValue*=60; 872 break; 873 case 0: 874 default: 875 break; 876 } 877 878 return numValue; 879 } else if(ntv<NTV_RESERVED_START_) { 880 // fraction-20 e.g. 3/80 881 return -2; 882 } else { 883 /* reserved */ 884 return -2; 885 } 886 } 887 888 public double getUnicodeNumericValue(int c) { 889 // equivalent to c version double u_getNumericValue(UChar32 c) 890 int ntv = getNumericTypeValue(getProperty(c)); 891 892 if(ntv==NTV_NONE_) { 893 return UCharacter.NO_NUMERIC_VALUE; 894 } else if(ntv<NTV_DIGIT_START_) { 895 /* decimal digit */ 896 return ntv-NTV_DECIMAL_START_; 897 } else if(ntv<NTV_NUMERIC_START_) { 898 /* other digit */ 899 return ntv-NTV_DIGIT_START_; 900 } else if(ntv<NTV_FRACTION_START_) { 901 /* small integer */ 902 return ntv-NTV_NUMERIC_START_; 903 } else if(ntv<NTV_LARGE_START_) { 904 /* fraction */ 905 int numerator=(ntv>>4)-12; 906 int denominator=(ntv&0xf)+1; 907 return (double)numerator/denominator; 908 } else if(ntv<NTV_BASE60_START_) { 909 /* large, single-significant-digit integer */ 910 double numValue; 911 int mant=(ntv>>5)-14; 912 int exp=(ntv&0x1f)+2; 913 numValue=mant; 914 915 /* multiply by 10^exp without math.h */ 916 while(exp>=4) { 917 numValue*=10000.; 918 exp-=4; 919 } 920 switch(exp) { 921 case 3: 922 numValue*=1000.; 923 break; 924 case 2: 925 numValue*=100.; 926 break; 927 case 1: 928 numValue*=10.; 929 break; 930 case 0: 931 default: 932 break; 933 } 934 935 return numValue; 936 } else if(ntv<NTV_FRACTION20_START_) { 937 /* sexagesimal (base 60) integer */ 938 int numValue=(ntv>>2)-0xbf; 939 int exp=(ntv&3)+1; 940 941 switch(exp) { 942 case 4: 943 numValue*=60*60*60*60; 944 break; 945 case 3: 946 numValue*=60*60*60; 947 break; 948 case 2: 949 numValue*=60*60; 950 break; 951 case 1: 952 numValue*=60; 953 break; 954 case 0: 955 default: 956 break; 957 } 958 959 return numValue; 960 } else 
if(ntv<NTV_RESERVED_START_) { 961 // fraction-20 e.g. 3/80 962 int frac20=ntv-NTV_FRACTION20_START_; // 0..0x17 963 int numerator=2*(frac20&3)+1; 964 int denominator=20<<(frac20>>2); 965 return (double)numerator/denominator; 966 } else { 967 /* reserved */ 968 return UCharacter.NO_NUMERIC_VALUE; 969 } 970 } 971 972 // protected variables ----------------------------------------------- 973 974 /** 975 * Extra property trie 976 */ 977 Trie2_16 m_additionalTrie_; 978 /** 979 * Extra property vectors, 1st column for age and second for binary 980 * properties. 981 */ 982 int m_additionalVectors_[]; 983 /** 984 * Number of additional columns 985 */ 986 int m_additionalColumnsCount_; 987 /** 988 * Maximum values for block, bits used as in vector word 989 * 0 990 */ 991 int m_maxBlockScriptValue_; 992 /** 993 * Maximum values for script, bits used as in vector word 994 * 0 995 */ 996 int m_maxJTGValue_; 997 998 /** 999 * Script_Extensions data 1000 */ 1001 public char[] m_scriptExtensions_; 1002 1003 // private variables ------------------------------------------------- 1004 1005 /** 1006 * Default name of the datafile 1007 */ 1008 private static final String DATA_FILE_NAME_ = "uprops.icu"; 1009 1010 // property data constants ------------------------------------------------- 1011 1012 /** 1013 * Numeric types and values in the main properties words. 1014 */ 1015 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; 1016 private static final int getNumericTypeValue(int props) { 1017 return props >> NUMERIC_TYPE_VALUE_SHIFT_; 1018 } 1019 /* constants for the storage form of numeric types and values */ 1020 /** No numeric value. 
*/ 1021 private static final int NTV_NONE_ = 0; 1022 /** Decimal digits: nv=0..9 */ 1023 private static final int NTV_DECIMAL_START_ = 1; 1024 /** Other digits: nv=0..9 */ 1025 private static final int NTV_DIGIT_START_ = 11; 1026 /** Small integers: nv=0..154 */ 1027 private static final int NTV_NUMERIC_START_ = 21; 1028 /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ 1029 private static final int NTV_FRACTION_START_ = 0xb0; 1030 /** 1031 * Large integers: 1032 * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) 1033 * (only one significant decimal digit) 1034 */ 1035 private static final int NTV_LARGE_START_ = 0x1e0; 1036 /** 1037 * Sexagesimal numbers: 1038 * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) 1039 */ 1040 private static final int NTV_BASE60_START_=0x300; 1041 /** 1042 * Fraction-20 values: 1043 * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640 1044 * numerator: num = 2*(frac20&3)+1 1045 * denominator: den = 20<<(frac20>>2) 1046 */ 1047 private static final int NTV_FRACTION20_START_ = NTV_BASE60_START_ + 36; // 0x300+9*4=0x324 1048 /** No numeric value (yet). */ 1049 private static final int NTV_RESERVED_START_ = NTV_FRACTION20_START_ + 24; // 0x324+6*4=0x34c 1050 1051 private static final int ntvGetType(int ntv) { 1052 return 1053 (ntv==NTV_NONE_) ? NumericType.NONE : 1054 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : 1055 (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT : 1056 NumericType.NUMERIC; 1057 } 1058 1059 /* 1060 * Properties in vector word 0 1061 * Bits 1062 * 31..24 DerivedAge version major/minor one nibble each 1063 * 23..22 3..1: Bits 7..0 = Script_Extensions index 1064 * 3: Script value from Script_Extensions 1065 * 2: Script=Inherited 1066 * 1: Script=Common 1067 * 0: Script=bits 7..0 1068 * 21..20 reserved 1069 * 19..17 East Asian Width 1070 * 16.. 8 UBlockCode 1071 * 7.. 
0   UScriptCode
     */

    /**
     * Script_Extensions: mask includes Script
     * (covers bits 23..22 and bits 7..0 of vector word 0).
     */
    public static final int SCRIPT_X_MASK = 0x00c000ff;
    //private static final int SCRIPT_X_SHIFT = 22;
    /**
     * Integer properties mask for East Asian cell width
     * (bits 19..17 of vector word 0).
     * Equivalent to icu4c UPROPS_EA_MASK
     */
    private static final int EAST_ASIAN_MASK_ = 0x000e0000;
    /**
     * Integer properties shift value for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_SHIFT
     */
    private static final int EAST_ASIAN_SHIFT_ = 17;
    /**
     * Integer properties mask for blocks (bits 16..8 of vector word 0).
     * Equivalent to icu4c UPROPS_BLOCK_MASK
     */
    private static final int BLOCK_MASK_ = 0x0001ff00;
    /**
     * Integer properties shift value for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_SHIFT
     */
    private static final int BLOCK_SHIFT_ = 8;
    /**
     * Integer properties mask for scripts (bits 7..0 of vector word 0).
     * NOTE(review): this comment previously named icu4c UPROPS_SHIFT_MASK,
     * which looks like a typo for UPROPS_SCRIPT_MASK - confirm against icu4c.
     */
    public static final int SCRIPT_MASK_ = 0x000000ff;

    /*
     * Values of bits 23..22 of vector word 0, in place (not shifted down).
     * SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
     */
    /* bits 23..22 == 1 */
    public static final int SCRIPT_X_WITH_COMMON = 0x400000;
    /* bits 23..22 == 2 */
    public static final int SCRIPT_X_WITH_INHERITED = 0x800000;
    /* bits 23..22 == 3 */
    public static final int SCRIPT_X_WITH_OTHER = 0xc00000;

    /**
     * Additional properties used in internal trie data
     */
    /*
     * Properties in vector word 1
     * Each bit encodes one binary property.
     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
     * UPROPS_BINARY_1_TOP<=32!
     *
     * Keep this list of property enums in sync with
     * propListNames[] in icu/source/tools/genprops/props2.c!
     *
     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
     */
    private static final int WHITE_SPACE_PROPERTY_ = 0;
    private static final int DASH_PROPERTY_ = 1;
    private static final int HYPHEN_PROPERTY_ = 2;
    private static final int QUOTATION_MARK_PROPERTY_ = 3;
    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
    private static final int MATH_PROPERTY_ = 5;
    private static final int HEX_DIGIT_PROPERTY_ = 6;
    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
    private static final int ALPHABETIC_PROPERTY_ = 8;
    private static final int IDEOGRAPHIC_PROPERTY_ = 9;
    private static final int DIACRITIC_PROPERTY_ = 10;
    private static final int EXTENDER_PROPERTY_ = 11;
    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
    private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
    private static final int GRAPHEME_LINK_PROPERTY_ = 14;
    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
    private static final int RADICAL_PROPERTY_ = 17;
    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
    private static final int DEPRECATED_PROPERTY_ = 20;
    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
    private static final int XID_START_PROPERTY_ = 22;
    private static final int XID_CONTINUE_PROPERTY_ = 23;
    private static final int ID_START_PROPERTY_ = 24;
    private static final int ID_CONTINUE_PROPERTY_ = 25;
    private static final int GRAPHEME_BASE_PROPERTY_ = 26;
    private static final int S_TERM_PROPERTY_ = 27;
    private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
    private static final int PATTERN_SYNTAX = 29;                   /* new in ICU 3.4 and Unicode 4.1 */
    private static final int PATTERN_WHITE_SPACE = 30;

    /*
     * Properties in vector word 2
     * Bits
     * 31..28
http://www.unicode.org/reports/tr51/#Emoji_Properties 1160 * 27..26 reserved 1161 * 25..20 Line Break 1162 * 19..15 Sentence Break 1163 * 14..10 Word Break 1164 * 9.. 5 Grapheme Cluster Break 1165 * 4.. 0 Decomposition Type 1166 */ 1167 private static final int PROPS_2_EMOJI = 28; 1168 private static final int PROPS_2_EMOJI_PRESENTATION = 29; 1169 private static final int PROPS_2_EMOJI_MODIFIER = 30; 1170 private static final int PROPS_2_EMOJI_MODIFIER_BASE = 31; 1171 1172 private static final int LB_MASK = 0x03f00000; 1173 private static final int LB_SHIFT = 20; 1174 1175 private static final int SB_MASK = 0x000f8000; 1176 private static final int SB_SHIFT = 15; 1177 1178 private static final int WB_MASK = 0x00007c00; 1179 private static final int WB_SHIFT = 10; 1180 1181 private static final int GCB_MASK = 0x000003e0; 1182 private static final int GCB_SHIFT = 5; 1183 1184 /** 1185 * Integer properties mask for decomposition type. 1186 * Equivalent to icu4c UPROPS_DT_MASK. 1187 */ 1188 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f; 1189 1190 /** 1191 * First nibble shift 1192 */ 1193 private static final int FIRST_NIBBLE_SHIFT_ = 0x4; 1194 /** 1195 * Second nibble mask 1196 */ 1197 private static final int LAST_NIBBLE_MASK_ = 0xF; 1198 /** 1199 * Age value shift 1200 */ 1201 private static final int AGE_SHIFT_ = 24; 1202 1203 1204 // private constructors -------------------------------------------------- 1205 1206 /** 1207 * Constructor 1208 * @exception IOException thrown when data reading fails or data corrupted 1209 */ 1210 private UCharacterProperty() throws IOException 1211 { 1212 // consistency check 1213 if(binProps.length!=UProperty.BINARY_LIMIT) { 1214 throw new ICUException("binProps.length!=UProperty.BINARY_LIMIT"); 1215 } 1216 if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) { 1217 throw new ICUException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)"); 1218 } 1219 1220 // jar access 1221 ByteBuffer 
bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_); 1222 m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable()); 1223 // Read or skip the 16 indexes. 1224 int propertyOffset = bytes.getInt(); 1225 /* exceptionOffset = */ bytes.getInt(); 1226 /* caseOffset = */ bytes.getInt(); 1227 int additionalOffset = bytes.getInt(); 1228 int additionalVectorsOffset = bytes.getInt(); 1229 m_additionalColumnsCount_ = bytes.getInt(); 1230 int scriptExtensionsOffset = bytes.getInt(); 1231 int reservedOffset7 = bytes.getInt(); 1232 /* reservedOffset8 = */ bytes.getInt(); 1233 /* dataTopOffset = */ bytes.getInt(); 1234 m_maxBlockScriptValue_ = bytes.getInt(); 1235 m_maxJTGValue_ = bytes.getInt(); 1236 ICUBinary.skipBytes(bytes, (16 - 12) << 2); 1237 1238 // read the main properties trie 1239 m_trie_ = Trie2_16.createFromSerialized(bytes); 1240 int expectedTrieLength = (propertyOffset - 16) * 4; 1241 int trieLength = m_trie_.getSerializedLength(); 1242 if(trieLength > expectedTrieLength) { 1243 throw new IOException("uprops.icu: not enough bytes for main trie"); 1244 } 1245 // skip padding after trie bytes 1246 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1247 1248 // skip unused intervening data structures 1249 ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); 1250 1251 if(m_additionalColumnsCount_ > 0) { 1252 // reads the additional property block 1253 m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); 1254 expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; 1255 trieLength = m_additionalTrie_.getSerializedLength(); 1256 if(trieLength > expectedTrieLength) { 1257 throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); 1258 } 1259 // skip padding after trie bytes 1260 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1261 1262 // additional properties 1263 int size = scriptExtensionsOffset - additionalVectorsOffset; 1264 m_additionalVectors_ = 
ICUBinary.getInts(bytes, size, 0); 1265 } 1266 1267 // Script_Extensions 1268 int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; 1269 if(numChars > 0) { 1270 m_scriptExtensions_ = ICUBinary.getChars(bytes, numChars, 0); 1271 } 1272 } 1273 1274 private static final class IsAcceptable implements ICUBinary.Authenticate { 1275 // @Override when we switch to Java 6 1276 @Override 1277 public boolean isDataVersionAcceptable(byte version[]) { 1278 return version[0] == 7; 1279 } 1280 } 1281 private static final int DATA_FORMAT = 0x5550726F; // "UPro" 1282 1283 // private methods ------------------------------------------------------- 1284 1285 /* 1286 * Compare additional properties to see if it has argument type 1287 * @param property 32 bit properties 1288 * @param type character type 1289 * @return true if property has type 1290 */ 1291 /*private boolean compareAdditionalType(int property, int type) 1292 { 1293 return (property & (1 << type)) != 0; 1294 }*/ 1295 1296 // property starts for UnicodeSet -------------------------------------- *** 1297 1298 private static final int TAB = 0x0009; 1299 //private static final int LF = 0x000a; 1300 //private static final int FF = 0x000c; 1301 private static final int CR = 0x000d; 1302 private static final int U_A = 0x0041; 1303 private static final int U_F = 0x0046; 1304 private static final int U_Z = 0x005a; 1305 private static final int U_a = 0x0061; 1306 private static final int U_f = 0x0066; 1307 private static final int U_z = 0x007a; 1308 private static final int DEL = 0x007f; 1309 private static final int NL = 0x0085; 1310 private static final int NBSP = 0x00a0; 1311 private static final int CGJ = 0x034f; 1312 private static final int FIGURESP= 0x2007; 1313 private static final int HAIRSP = 0x200a; 1314 //private static final int ZWNJ = 0x200c; 1315 //private static final int ZWJ = 0x200d; 1316 private static final int RLM = 0x200f; 1317 private static final int NNBSP = 0x202f; 1318 private static final int WJ 
= 0x2060; 1319 private static final int INHSWAP = 0x206a; 1320 private static final int NOMDIG = 0x206f; 1321 private static final int U_FW_A = 0xff21; 1322 private static final int U_FW_F = 0xff26; 1323 private static final int U_FW_Z = 0xff3a; 1324 private static final int U_FW_a = 0xff41; 1325 private static final int U_FW_f = 0xff46; 1326 private static final int U_FW_z = 0xff5a; 1327 private static final int ZWNBSP = 0xfeff; 1328 1329 public UnicodeSet addPropertyStarts(UnicodeSet set) { 1330 /* add the start code point of each same-value range of the main trie */ 1331 Iterator<Trie2.Range> trieIterator = m_trie_.iterator(); 1332 Trie2.Range range; 1333 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1334 set.add(range.startCodePoint); 1335 } 1336 1337 /* add code points with hardcoded properties, plus the ones following them */ 1338 1339 /* add for u_isblank() */ 1340 set.add(TAB); 1341 set.add(TAB+1); 1342 1343 /* add for IS_THAT_CONTROL_SPACE() */ 1344 set.add(CR+1); /* range TAB..CR */ 1345 set.add(0x1c); 1346 set.add(0x1f+1); 1347 set.add(NL); 1348 set.add(NL+1); 1349 1350 /* add for u_isIDIgnorable() what was not added above */ 1351 set.add(DEL); /* range DEL..NBSP-1, NBSP added below */ 1352 set.add(HAIRSP); 1353 set.add(RLM+1); 1354 set.add(INHSWAP); 1355 set.add(NOMDIG+1); 1356 set.add(ZWNBSP); 1357 set.add(ZWNBSP+1); 1358 1359 /* add no-break spaces for u_isWhitespace() what was not added above */ 1360 set.add(NBSP); 1361 set.add(NBSP+1); 1362 set.add(FIGURESP); 1363 set.add(FIGURESP+1); 1364 set.add(NNBSP); 1365 set.add(NNBSP+1); 1366 1367 /* add for u_charDigitValue() */ 1368 // TODO remove when UCharacter.getHanNumericValue() is changed to just return 1369 // Unicode numeric values 1370 set.add(0x3007); 1371 set.add(0x3008); 1372 set.add(0x4e00); 1373 set.add(0x4e01); 1374 set.add(0x4e8c); 1375 set.add(0x4e8d); 1376 set.add(0x4e09); 1377 set.add(0x4e0a); 1378 set.add(0x56db); 1379 set.add(0x56dc); 1380 
set.add(0x4e94); 1381 set.add(0x4e95); 1382 set.add(0x516d); 1383 set.add(0x516e); 1384 set.add(0x4e03); 1385 set.add(0x4e04); 1386 set.add(0x516b); 1387 set.add(0x516c); 1388 set.add(0x4e5d); 1389 set.add(0x4e5e); 1390 1391 /* add for u_digit() */ 1392 set.add(U_a); 1393 set.add(U_z+1); 1394 set.add(U_A); 1395 set.add(U_Z+1); 1396 set.add(U_FW_a); 1397 set.add(U_FW_z+1); 1398 set.add(U_FW_A); 1399 set.add(U_FW_Z+1); 1400 1401 /* add for u_isxdigit() */ 1402 set.add(U_f+1); 1403 set.add(U_F+1); 1404 set.add(U_FW_f+1); 1405 set.add(U_FW_F+1); 1406 1407 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ 1408 set.add(WJ); /* range WJ..NOMDIG */ 1409 set.add(0xfff0); 1410 set.add(0xfffb+1); 1411 set.add(0xe0000); 1412 set.add(0xe0fff+1); 1413 1414 /* add for UCHAR_GRAPHEME_BASE and others */ 1415 set.add(CGJ); 1416 set.add(CGJ+1); 1417 1418 return set; // for chaining 1419 } 1420 1421 public void upropsvec_addPropertyStarts(UnicodeSet set) { 1422 /* add the start code point of each same-value range of the properties vectors trie */ 1423 if(m_additionalColumnsCount_>0) { 1424 /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ 1425 Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator(); 1426 Trie2.Range range; 1427 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1428 set.add(range.startCodePoint); 1429 } 1430 } 1431 } 1432 1433 // This static initializer block must be placed after 1434 // other static member initialization 1435 static { 1436 try { 1437 INSTANCE = new UCharacterProperty(); 1438 } 1439 catch (IOException e) { 1440 throw new MissingResourceException(e.getMessage(),"",""); 1441 } 1442 } 1443 1444 /*---------------------------------------------------------------- 1445 * Inclusions list 1446 *----------------------------------------------------------------*/ 1447 1448 /* 1449 * Return a set of characters for property enumeration. 
1450 * The set implicitly contains 0x110000 as well, which is one more than the highest 1451 * Unicode code point. 1452 * 1453 * This set is used as an ordered list - its code points are ordered, and 1454 * consecutive code points (in Unicode code point order) in the set define a range. 1455 * For each two consecutive characters (start, limit) in the set, 1456 * all of the UCD/normalization and related properties for 1457 * all code points start..limit-1 are all the same, 1458 * except for character names and ISO comments. 1459 * 1460 * All Unicode code points U+0000..U+10ffff are covered by these ranges. 1461 * The ranges define a partition of the Unicode code space. 1462 * ICU uses the inclusions set to enumerate properties for generating 1463 * UnicodeSets containing all code points that have a certain property value. 1464 * 1465 * The Inclusion List is generated from the UCD. It is generated 1466 * by enumerating the data tries, and code points for hardcoded properties 1467 * are added as well. 1468 * 1469 * -------------------------------------------------------------------------- 1470 * 1471 * The following are ideas for getting properties-unique code point ranges, 1472 * with possible optimizations beyond the current implementation. 1473 * These optimizations would require more code and be more fragile. 1474 * The current implementation generates one single list (set) for all properties. 1475 * 1476 * To enumerate properties efficiently, one needs to know ranges of 1477 * repetitive values, so that the value of only each start code point 1478 * can be applied to the whole range. 1479 * This information is in principle available in the uprops.icu/unorm.icu data. 1480 * 1481 * There are two obstacles: 1482 * 1483 * 1. Some properties are computed from multiple data structures, 1484 * making it necessary to get repetitive ranges by intersecting 1485 * ranges from multiple tries. 1486 * 1487 * 2. 
It is not economical to write code for getting repetitive ranges 1488 * that are precise for each of some 50 properties. 1489 * 1490 * Compromise ideas: 1491 * 1492 * - Get ranges per trie, not per individual property. 1493 * Each range contains the same values for a whole group of properties. 1494 * This would generate currently five range sets, two for uprops.icu tries 1495 * and three for unorm.icu tries. 1496 * 1497 * - Combine sets of ranges for multiple tries to get sufficient sets 1498 * for properties, e.g., the uprops.icu main and auxiliary tries 1499 * for all non-normalization properties. 1500 * 1501 * Ideas for representing ranges and combining them: 1502 * 1503 * - A UnicodeSet could hold just the start code points of ranges. 1504 * Multiple sets are easily combined by or-ing them together. 1505 * 1506 * - Alternatively, a UnicodeSet could hold each even-numbered range. 1507 * All ranges could be enumerated by using each start code point 1508 * (for the even-numbered ranges) as well as each limit (end+1) code point 1509 * (for the odd-numbered ranges). 1510 * It should be possible to combine two such sets by xor-ing them, 1511 * but no more than two. 1512 * 1513 * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays, 1514 * but the first one is certainly simpler and applicable for combining more than 1515 * two range sets. 1516 * 1517 * It is possible to combine all range sets for all uprops/unorm tries into one 1518 * set that can be used for all properties. 1519 * As an optimization, there could be less-combined range sets for certain 1520 * groups of properties. 1521 * The relationship of which less-combined range set to use for which property 1522 * depends on the implementation of the properties and must be hardcoded 1523 * - somewhat error-prone and higher maintenance but can be tested easily 1524 * by building property sets "the simple way" in test code. 
1525 * 1526 * --- 1527 * 1528 * Do not use a UnicodeSet pattern because that causes infinite recursion; 1529 * UnicodeSet depends on the inclusions set. 1530 * 1531 * --- 1532 * 1533 * getInclusions() is commented out starting 2005-feb-12 because 1534 * UnicodeSet now calls the uxyz_addPropertyStarts() directly, 1535 * and only for the relevant property source. 1536 */ 1537 /* 1538 public UnicodeSet getInclusions() { 1539 UnicodeSet set = new UnicodeSet(); 1540 NormalizerImpl.addPropertyStarts(set); 1541 addPropertyStarts(set); 1542 return set; 1543 } 1544 */ 1545 } 1546