// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 1996-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.impl;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.MissingResourceException;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacter.HangulSyllableType;
import com.ibm.icu.lang.UCharacter.NumericType;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUException;
import com.ibm.icu.util.VersionInfo;

/**
 * <p>Internal class used for the Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, array of char is used
 * to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into a more meaningful form
 * lies with <a href=UCharacter.html>UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 */

public final class UCharacterProperty
{
    // public data members -----------------------------------------------

    /*
     * public singleton instance
     */
    public static final UCharacterProperty INSTANCE;

    /**
     * Trie data. This is the trie that getProperty(int) reads the main
     * property word from.
     */
    public Trie2_16 m_trie_;
    /**
     * Unicode version
     */
    public VersionInfo m_unicodeVersion_;
    /**
     * Latin capital letter i with dot above (U+0130)
     */
    public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130;
    /**
     * Latin small letter dotless i (U+0131)
     */
    public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131;
    /**
     * Latin small letter i (U+0069)
     */
    public static final char LATIN_SMALL_LETTER_I_ = 0x69;
    /**
     * Character type mask: getType(int) ANDs the main property word with
     * this mask to obtain the character type.
     */
    public static final int TYPE_MASK = 0x1F;

    // uprops.h enum UPropertySource --------------------------------------- ***

    /** No source, not a supported property.
*/ 81 public static final int SRC_NONE=0; 82 /** From uchar.c/uprops.icu main trie */ 83 public static final int SRC_CHAR=1; 84 /** From uchar.c/uprops.icu properties vectors trie */ 85 public static final int SRC_PROPSVEC=2; 86 /** From unames.c/unames.icu */ 87 public static final int SRC_NAMES=3; 88 /** From ucase.c/ucase.icu */ 89 public static final int SRC_CASE=4; 90 /** From ubidi_props.c/ubidi.icu */ 91 public static final int SRC_BIDI=5; 92 /** From uchar.c/uprops.icu main trie as well as properties vectors trie */ 93 public static final int SRC_CHAR_AND_PROPSVEC=6; 94 /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */ 95 public static final int SRC_CASE_AND_NORM=7; 96 /** From normalizer2impl.cpp/nfc.nrm */ 97 public static final int SRC_NFC=8; 98 /** From normalizer2impl.cpp/nfkc.nrm */ 99 public static final int SRC_NFKC=9; 100 /** From normalizer2impl.cpp/nfkc_cf.nrm */ 101 public static final int SRC_NFKC_CF=10; 102 /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */ 103 public static final int SRC_NFC_CANON_ITER=11; 104 /** One more than the highest UPropertySource (SRC_) constant. */ 105 public static final int SRC_COUNT=12; 106 107 // public methods ---------------------------------------------------- 108 109 /** 110 * Gets the main property value for code point ch. 111 * @param ch code point whose property value is to be retrieved 112 * @return property value of code point 113 */ 114 public final int getProperty(int ch) 115 { 116 return m_trie_.get(ch); 117 } 118 119 /** 120 * Gets the unicode additional properties. 121 * Java version of C u_getUnicodeProperties(). 122 * @param codepoint codepoint whose additional properties is to be 123 * retrieved 124 * @param column The column index. 
125 * @return unicode properties 126 */ 127 public int getAdditional(int codepoint, int column) { 128 assert column >= 0; 129 if (column >= m_additionalColumnsCount_) { 130 return 0; 131 } 132 return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; 133 } 134 135 static final int MY_MASK = UCharacterProperty.TYPE_MASK 136 & ((1<<UCharacterCategory.UPPERCASE_LETTER) | 137 (1<<UCharacterCategory.LOWERCASE_LETTER) | 138 (1<<UCharacterCategory.TITLECASE_LETTER) | 139 (1<<UCharacterCategory.MODIFIER_LETTER) | 140 (1<<UCharacterCategory.OTHER_LETTER)); 141 142 143 /** 144 * <p>Get the "age" of the code point.</p> 145 * <p>The "age" is the Unicode version when the code point was first 146 * designated (as a non-character or for Private Use) or assigned a 147 * character.</p> 148 * <p>This can be useful to avoid emitting code points to receiving 149 * processes that do not accept newer characters.</p> 150 * <p>The data is from the UCD file DerivedAge.txt.</p> 151 * <p>This API does not check the validity of the codepoint.</p> 152 * @param codepoint The code point. 153 * @return the Unicode version number 154 */ 155 public VersionInfo getAge(int codepoint) 156 { 157 int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; 158 return VersionInfo.getInstance( 159 (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, 160 version & LAST_NIBBLE_MASK_, 0, 0); 161 } 162 163 private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED); 164 private static final int GC_CC_MASK = getMask(UCharacter.CONTROL); 165 private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE); 166 private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR); 167 private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR); 168 private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR); 169 /** Mask constant for multiple UCharCategory bits (Z Separators). 
*/
    private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK;

    /**
     * Checks if c is in
     * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}]
     * with space=\p{Whitespace} and Control=Cc.
     * Implements UCHAR_POSIX_GRAPH.
     * @param c the code point to test
     * @return true if the general category of c is none of Cc, Cs, Cn, Zs, Zl, Zp
     * @internal
     */
    private static final boolean isgraphPOSIX(int c) {
        /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */
        /* comparing ==0 returns FALSE for the categories mentioned */
        return (getMask(UCharacter.getType(c))&
                (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK))
               ==0;
    }

    // binary properties --------------------------------------------------- ***

    /**
     * One binary property. The base implementation tests a bit mask against
     * a word of the additional property vector; subclasses override
     * contains() for properties that are computed from other data.
     */
    private class BinaryProperty {
        int column;  // SRC_PROPSVEC column, or "source" if mask==0
        int mask;
        /** Property stored directly in the given vector column under the given mask. */
        BinaryProperty(int column, int mask) {
            this.column=column;
            this.mask=mask;
        }
        /** Property computed elsewhere; only its SRC_ source is recorded (mask stays 0). */
        BinaryProperty(int source) {
            this.column=source;
            this.mask=0;
        }
        /** Returns the recorded source when mask==0, otherwise SRC_PROPSVEC. */
        final int getSource() {
            return mask==0 ? column : SRC_PROPSVEC;
        }
        /** Returns true if the code point c has this property. */
        boolean contains(int c) {
            // systematic, directly stored properties
            return (getAdditional(c, column)&mask)!=0;
        }
    }

    /** Binary property whose value is delegated to UCaseProps; source is SRC_CASE. */
    private class CaseBinaryProperty extends BinaryProperty {  // case mapping properties
        int which;  // the UProperty selector passed through to UCaseProps
        CaseBinaryProperty(int which) {
            super(SRC_CASE);
            this.which=which;
        }
        @Override
        boolean contains(int c) {
            return UCaseProps.INSTANCE.hasBinaryProperty(c, which);
        }
    }

    /** Binary property answered by the isInert() test of a normalizer instance. */
    private class NormInertBinaryProperty extends BinaryProperty {  // UCHAR_NF*_INERT properties
        int which;  // one of the UProperty.NF*_INERT selectors
        NormInertBinaryProperty(int source, int which) {
            super(source);
            this.which=which;
        }
        @Override
        boolean contains(int c) {
            // The normalizer is selected by the offset of this property from NFD_INERT.
            return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
        }
    }

    // Indexed directly by the UProperty selector in hasBinaryProperty(),
    // getIntPropertyValue() and getSource(): do not reorder, insert or
    // remove entries without keeping the indexes aligned.
    BinaryProperty[] binProps={
        /*
         * Binary-property implementations must be in order of corresponding UProperty,
         * and there must be exactly one entry per binary UProperty.
         */
        new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)),
        new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isBidiControl(c);
            }
        },
        new BinaryProperty(SRC_BIDI) {  // UCHAR_BIDI_MIRRORED
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isMirrored(c);
            }
        },
        new BinaryProperty(1, (1<<DASH_PROPERTY_)),
        new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)),
        new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)),
        new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)),
        new BinaryProperty(SRC_NFC) {  // UCHAR_FULL_COMPOSITION_EXCLUSION
            @Override
            boolean contains(int c) {
                // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
                Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl;
                return impl.isCompNo(impl.getNorm16(c));
            }
        },
        new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)),
        new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)),
        new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)),
        new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<ID_START_PROPERTY_)),
        new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)),
        new BinaryProperty(SRC_BIDI) {  // UCHAR_JOIN_CONTROL
            @Override
            boolean contains(int c) {
                return UBiDiProps.INSTANCE.isJoinControl(c);
            }
        },
        new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)),
        new CaseBinaryProperty(UProperty.LOWERCASE),
        new BinaryProperty(1, (1<<MATH_PROPERTY_)),
        new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)),
        new BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)),
        new BinaryProperty(1, (1<<RADICAL_PROPERTY_)),
        new CaseBinaryProperty(UProperty.SOFT_DOTTED),
        new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)),
        new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)),
        new CaseBinaryProperty(UProperty.UPPERCASE),
        new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)),
        new BinaryProperty(1, (1<<XID_START_PROPERTY_)),
        new CaseBinaryProperty(UProperty.CASE_SENSITIVE),
        new BinaryProperty(1, (1<<S_TERM_PROPERTY_)),
        new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT),
        new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT),
        new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT),
        new BinaryProperty(SRC_NFC_CANON_ITER) {  // UCHAR_SEGMENT_STARTER
            @Override
            boolean contains(int c) {
                return Norm2AllModes.getNFCInstance().impl.
                    ensureCanonIterData().isCanonSegmentStarter(c);
            }
        },
        new BinaryProperty(1, (1<<PATTERN_SYNTAX)),
        new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)),
        new BinaryProperty(SRC_CHAR_AND_PROPSVEC) {  // UCHAR_POSIX_ALNUM
            @Override
            boolean contains(int c) {
                return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_BLANK
            @Override
            boolean contains(int c) {
                // "horizontal space"
                if(c<=0x9f) {
                    return c==9 || c==0x20; /* TAB or SPACE */
                } else {
                    /* Zs */
                    return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR;
                }
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_GRAPH
            @Override
            boolean contains(int c) {
                return isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_PRINT
            @Override
            boolean contains(int c) {
                /*
                 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}.
                 *
                 * The only cntrl character in graph+blank is TAB (in blank).
                 * Here we implement (blank-TAB)=Zs instead of calling u_isblank().
                 */
                return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c);
            }
        },
        new BinaryProperty(SRC_CHAR) {  // UCHAR_POSIX_XDIGIT
            @Override
            boolean contains(int c) {
                /* check ASCII and Fullwidth ASCII a-fA-F */
                if(
                    (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) ||
                    (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41))
                ) {
                    return true;
                }
                // everything else counts only if it is a decimal digit (Nd)
                return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER;
            }
        },
        new CaseBinaryProperty(UProperty.CASED),
        new CaseBinaryProperty(UProperty.CASE_IGNORABLE),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED),
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED),
        new BinaryProperty(SRC_CASE_AND_NORM) {  // UCHAR_CHANGES_WHEN_CASEFOLDED
            @Override
            boolean contains(int c) {
                String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c);
                if(nfd!=null) {
                    /* c has a decomposition */
                    c=nfd.codePointAt(0);
                    if(Character.charCount(c)!=nfd.length()) {
                        /* multiple code points */
                        c=-1;
                    }
                } else if(c<0) {
                    return false;  /* protect against bad input */
                }
                if(c>=0) {
                    /* single code point */
                    UCaseProps csp=UCaseProps.INSTANCE;
                    // NOTE(review): dummyStringBuilder is a shared static on
                    // UCaseProps; it is reset here before use - confirm that
                    // concurrent callers are acceptable.
                    UCaseProps.dummyStringBuilder.setLength(0);
                    return csp.toFullFolding(c, UCaseProps.dummyStringBuilder,
                                             UCharacter.FOLD_CASE_DEFAULT)>=0;
                } else {
                    // decomposition has several code points: compare it with its case folding
                    String folded=UCharacter.foldCase(nfd, true);
                    return !folded.equals(nfd);
                }
            }
        },
        new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED),
        new BinaryProperty(SRC_NFKC_CF) {  // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
            @Override
            boolean contains(int c) {
                Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl;
                String src=UTF16.valueOf(c);
                StringBuilder dest=new StringBuilder();
                // Small destCapacity for NFKC_CF(c).
                Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5);
                kcf.compose(src, 0, src.length(), false, true, buffer);
                // the property holds when the NFKC_CF result differs from the input
                return !Normalizer2Impl.UTF16Plus.equal(dest, src);
            }
        },
        new BinaryProperty(2, 1<<PROPS_2_EMOJI),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_PRESENTATION),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_MODIFIER_BASE),
        new BinaryProperty(2, 1<<PROPS_2_EMOJI_COMPONENT),
        new BinaryProperty(SRC_PROPSVEC) {  // REGIONAL_INDICATOR
            // Property starts are a subset of lb=RI etc.
            @Override
            boolean contains(int c) {
                return 0x1F1E6<=c && c<=0x1F1FF;
            }
        },
        new BinaryProperty(1, 1<<PREPENDED_CONCATENATION_MARK),
    };

    /**
     * Tests whether a code point has a binary property.
     * @param c the code point to test
     * @param which the UProperty selector of the binary property
     * @return true if c has the property; false if it does not or if which
     *         is outside UProperty.BINARY_START..BINARY_LIMIT-1
     */
    public boolean hasBinaryProperty(int c, int which) {
        if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) {
            // not a known binary property
            return false;
        } else {
            return binProps[which].contains(c);
        }
    }

    // int-value and enumerated properties --------------------------------- ***

    /**
     * Returns the character type of a code point: the main property word
     * masked with TYPE_MASK.
     * @param c the code point
     * @return the character type value
     */
    public int getType(int c) {
        return getProperty(c)&TYPE_MASK;
    }

    /*
     * Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
     * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
*/
    private static final int /* UHangulSyllableType */ gcbToHst[]={
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_OTHER */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CONTROL */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_CR */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_EXTEND */
        HangulSyllableType.LEADING_JAMO,     /* U_GCB_L */
        HangulSyllableType.NOT_APPLICABLE,   /* U_GCB_LF */
        HangulSyllableType.LV_SYLLABLE,      /* U_GCB_LV */
        HangulSyllableType.LVT_SYLLABLE,     /* U_GCB_LVT */
        HangulSyllableType.TRAILING_JAMO,    /* U_GCB_T */
        HangulSyllableType.VOWEL_JAMO        /* U_GCB_V */
        /*
         * Omit GCB values beyond what we need for hst.
         * The code below checks for the array length.
         */
    };

    /**
     * One int-valued (enumerated) property. The base implementation extracts
     * a masked, shifted field from a word of the additional property vector;
     * subclasses override getValue()/getMaxValue() for properties that are
     * computed from other data.
     */
    private class IntProperty {
        int column;  // SRC_PROPSVEC column, or "source" if mask==0
        int mask;
        int shift;
        /** Property stored in the given vector column as (word&mask)>>>shift. */
        IntProperty(int column, int mask, int shift) {
            this.column=column;
            this.mask=mask;
            this.shift=shift;
        }
        /** Property computed elsewhere; only its SRC_ source is recorded (mask and shift stay 0). */
        IntProperty(int source) {
            this.column=source;
            this.mask=0;
        }
        /** Returns the recorded source when mask==0, otherwise SRC_PROPSVEC. */
        final int getSource() {
            return mask==0 ? column : SRC_PROPSVEC;
        }
        /** Returns the property value of the code point c. */
        int getValue(int c) {
            // systematic, directly stored properties
            return (getAdditional(c, column)&mask)>>>shift;
        }
        /**
         * Returns the maximum value of this property. The base
         * implementation ignores which and extracts the field from
         * getMaxValues(column).
         */
        int getMaxValue(int which) {
            return (getMaxValues(column)&mask)>>>shift;
        }
    }

    /** Int property sourced from SRC_BIDI; the maximum is asked of UBiDiProps. */
    private class BiDiIntProperty extends IntProperty {
        BiDiIntProperty() {
            super(SRC_BIDI);
        }
        @Override
        int getMaxValue(int which) {
            return UBiDiProps.INSTANCE.getMaxValue(which);
        }
    }

    /** Int property for combining classes; the maximum is fixed at 0xff. */
    private class CombiningClassIntProperty extends IntProperty {
        CombiningClassIntProperty(int source) {
            super(source);
        }
        @Override
        int getMaxValue(int which) {
            return 0xff;
        }
    }

    /** Int property answered by the getQuickCheck() of a normalizer instance. */
    private class NormQuickCheckIntProperty extends IntProperty {  // UCHAR_NF*_QUICK_CHECK properties
        int which;  // one of the UProperty.NF*_QUICK_CHECK selectors
        int max;    // value returned by getMaxValue()
        NormQuickCheckIntProperty(int source, int which, int max) {
            super(source);
            this.which=which;
            this.max=max;
        }
        @Override
        int getValue(int c) {
            // The normalizer is selected by the offset of this property from NFD_QUICK_CHECK.
            return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c);
        }
        @Override
        int getMaxValue(int which) {
            return max;
        }
    }

    // Indexed by (which - UProperty.INT_START) in getIntPropertyValue(),
    // getIntPropertyMaxValue() and getSource(): do not reorder, insert or
    // remove entries without keeping the indexes aligned.
    IntProperty intProps[]={
        new BiDiIntProperty() {  // BIDI_CLASS
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getClass(c);
            }
        },
        new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_),
        new CombiningClassIntProperty(SRC_NFC) {  // CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                return Normalizer2.getNFDInstance().getCombiningClass(c);
            }
        },
        new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0),
        new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_),
        new IntProperty(SRC_CHAR) {  // GENERAL_CATEGORY
            @Override
            int getValue(int c) {
                return getType(c);
            }
            @Override
            int getMaxValue(int which) {
                return UCharacterCategory.CHAR_CATEGORY_COUNT-1;
            }
        },
        new BiDiIntProperty() {  // JOINING_GROUP
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningGroup(c);
            }
        },
        new BiDiIntProperty() {  // JOINING_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getJoiningType(c);
            }
        },
        new IntProperty(2, LB_MASK, LB_SHIFT),  // LINE_BREAK
        new IntProperty(SRC_CHAR) {  // NUMERIC_TYPE
            @Override
            int getValue(int c) {
                return ntvGetType(getNumericTypeValue(getProperty(c)));
            }
            @Override
            int getMaxValue(int which) {
                return NumericType.COUNT-1;
            }
        },
        // Script: the value comes from UScript.getScript(); the column/mask
        // given here are still what the inherited getMaxValue() and
        // getSource() use.
        new IntProperty(0, SCRIPT_MASK_, 0) {
            @Override
            int getValue(int c) {
                return UScript.getScript(c);
            }
        },
        new IntProperty(SRC_PROPSVEC) {  // HANGUL_SYLLABLE_TYPE
            @Override
            int getValue(int c) {
                /* see comments on gcbToHst[] above */
                int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
                if(gcb<gcbToHst.length) {
                    return gcbToHst[gcb];
                } else {
                    return HangulSyllableType.NOT_APPLICABLE;
                }
            }
            @Override
            int getMaxValue(int which) {
                return HangulSyllableType.COUNT-1;
            }
        },
        // max=1=YES -- these are never "maybe", only "no" or "yes"
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1),
        // max=2=MAYBE
        new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2),
        new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2),
        new CombiningClassIntProperty(SRC_NFC) {  // LEAD_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // high byte of the FCD16 value
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8;
            }
        },
        new CombiningClassIntProperty(SRC_NFC) {  // TRAIL_CANONICAL_COMBINING_CLASS
            @Override
            int getValue(int c) {
                // low byte of the FCD16 value
                return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff;
            }
        },
        new IntProperty(2, GCB_MASK, GCB_SHIFT),  // GRAPHEME_CLUSTER_BREAK
        new IntProperty(2, SB_MASK, SB_SHIFT),  // SENTENCE_BREAK
        new IntProperty(2, WB_MASK, WB_SHIFT),  // WORD_BREAK
        new BiDiIntProperty() {  // BIDI_PAIRED_BRACKET_TYPE
            @Override
            int getValue(int c) {
                return UBiDiProps.INSTANCE.getPairedBracketType(c);
            }
        },
    };

    /**
     * Returns the value of a binary or int-valued property for a code point.
     * @param c the code point
     * @param which the UProperty selector
     * @return 1 or 0 for a binary property, the enumerated value for an
     *         int property, the category bit mask for
     *         UProperty.GENERAL_CATEGORY_MASK, and 0 for any other selector
     */
    public int getIntPropertyValue(int c, int which) {
        if(which<UProperty.INT_START) {
            if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
                return binProps[which].contains(c) ? 1 : 0;
            }
        } else if(which<UProperty.INT_LIMIT) {
            return intProps[which-UProperty.INT_START].getValue(c);
        } else if (which == UProperty.GENERAL_CATEGORY_MASK) {
            return getMask(getType(c));
        }
        return 0; // undefined
    }

    /**
     * Returns the maximum value a property can take.
     * @param which the UProperty selector
     * @return 1 for a binary property, the property's maximum for an int
     *         property, and -1 for any other selector
     */
    public int getIntPropertyMaxValue(int which) {
        if(which<UProperty.INT_START) {
            if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) {
                return 1;  // maximum TRUE for all binary properties
            }
        } else if(which<UProperty.INT_LIMIT) {
            return intProps[which-UProperty.INT_START].getMaxValue(which);
        }
        return -1; // undefined
    }

    /**
     * Returns the data source (one of the SRC_ constants) of a property.
     * @param which the UProperty selector
     * @return the SRC_ constant for the property, or SRC_NONE if the
     *         selector is not handled here
     */
    public final int getSource(int which) {
        if(which<UProperty.BINARY_START) {
            return SRC_NONE; /* undefined */
        } else if(which<UProperty.BINARY_LIMIT) {
            return binProps[which].getSource();
        } else if(which<UProperty.INT_START) {
            return SRC_NONE; /* undefined */
        } else if(which<UProperty.INT_LIMIT) {
            return intProps[which-UProperty.INT_START].getSource();
        } else if(which<UProperty.STRING_START) {
            switch(which) {
            case UProperty.GENERAL_CATEGORY_MASK:
            case UProperty.NUMERIC_VALUE:
                return SRC_CHAR;

            default:
                return SRC_NONE;
            }
        } else if(which<UProperty.STRING_LIMIT) {
            switch(which) {
            case UProperty.AGE:
                return SRC_PROPSVEC;

            case UProperty.BIDI_MIRRORING_GLYPH:
                return SRC_BIDI;

            case UProperty.CASE_FOLDING:
            case UProperty.LOWERCASE_MAPPING:
            case UProperty.SIMPLE_CASE_FOLDING:
            case UProperty.SIMPLE_LOWERCASE_MAPPING:
            case UProperty.SIMPLE_TITLECASE_MAPPING:
            case UProperty.SIMPLE_UPPERCASE_MAPPING:
            case UProperty.TITLECASE_MAPPING:
            case UProperty.UPPERCASE_MAPPING:
                return SRC_CASE;

            case UProperty.ISO_COMMENT:
            case UProperty.NAME:
            case UProperty.UNICODE_1_NAME:
                return SRC_NAMES;

            default:
                return SRC_NONE;
            }
        } else {
            switch(which) {
            case UProperty.SCRIPT_EXTENSIONS:
                return SRC_PROPSVEC;
            default:
                return SRC_NONE; /* undefined */
            }
        }
    }

    /**
     * <p>
     * Unicode property names and property value names are compared
     * "loosely". Property[Value]Aliases.txt say:
     * <quote>
     *   "With loose matching of property names, the case distinctions,
     *    whitespace, and '_' are ignored."
     * </quote>
     * </p>
     * <p>
     * This function does just that, for ASCII (char *) name strings.
     * It is almost identical to ucnv_compareNames() but also ignores
     * ASCII White_Space characters (U+0009..U+000d).
     * </p>
     * <p>
     * Note: the method this comment describes is currently commented out.
     * </p>
     * @param name1 name to compare
     * @param name2 name to compare
     * @return 0 if names are equal, &lt; 0 if name1 is less than name2 and &gt; 0
     *         if name1 is greater than name2.
713 */ 714 /* to be implemented in 2.4 715 * public static int comparePropertyNames(String name1, String name2) 716 { 717 int result = 0; 718 int i1 = 0; 719 int i2 = 0; 720 while (true) { 721 char ch1 = 0; 722 char ch2 = 0; 723 // Ignore delimiters '-', '_', and ASCII White_Space 724 if (i1 < name1.length()) { 725 ch1 = name1.charAt(i1 ++); 726 } 727 while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t' 728 || ch1 == '\n' // synwee what is || ch1 == '\v' 729 || ch1 == '\f' || ch1=='\r') { 730 if (i1 < name1.length()) { 731 ch1 = name1.charAt(i1 ++); 732 } 733 else { 734 ch1 = 0; 735 } 736 } 737 if (i2 < name2.length()) { 738 ch2 = name2.charAt(i2 ++); 739 } 740 while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t' 741 || ch2 == '\n' // synwee what is || ch1 == '\v' 742 || ch2 == '\f' || ch2=='\r') { 743 if (i2 < name2.length()) { 744 ch2 = name2.charAt(i2 ++); 745 } 746 else { 747 ch2 = 0; 748 } 749 } 750 751 // If we reach the ends of both strings then they match 752 if (ch1 == 0 && ch2 == 0) { 753 return 0; 754 } 755 756 // Case-insensitive comparison 757 if (ch1 != ch2) { 758 result = Character.toLowerCase(ch1) 759 - Character.toLowerCase(ch2); 760 if (result != 0) { 761 return result; 762 } 763 } 764 } 765 } 766 */ 767 768 /** 769 * Get the the maximum values for some enum/int properties. 770 * @return maximum values for the integer properties. 771 */ 772 public int getMaxValues(int column) 773 { 774 // return m_maxBlockScriptValue_; 775 776 switch(column) { 777 case 0: 778 return m_maxBlockScriptValue_; 779 case 2: 780 return m_maxJTGValue_; 781 default: 782 return 0; 783 } 784 } 785 786 /** 787 * Gets the type mask 788 * @param type character type 789 * @return mask 790 */ 791 public static final int getMask(int type) 792 { 793 return 1 << type; 794 } 795 796 797 /** 798 * Returns the digit values of characters like 'A' - 'Z', normal, 799 * half-width and full-width. 
This method assumes that the other digit 800 * characters are checked by the calling method. 801 * @param ch character to test 802 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise 803 * its corresponding digit will be returned. 804 */ 805 public static int getEuropeanDigit(int ch) { 806 if ((ch > 0x7a && ch < 0xff21) 807 || ch < 0x41 || (ch > 0x5a && ch < 0x61) 808 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { 809 return -1; 810 } 811 if (ch <= 0x7a) { 812 // ch >= 0x41 or ch < 0x61 813 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); 814 } 815 // ch >= 0xff21 816 if (ch <= 0xff3a) { 817 return ch + 10 - 0xff21; 818 } 819 // ch >= 0xff41 && ch <= 0xff5a 820 return ch + 10 - 0xff41; 821 } 822 823 public int digit(int c) { 824 int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; 825 if(value<=9) { 826 return value; 827 } else { 828 return -1; 829 } 830 } 831 832 public int getNumericValue(int c) { 833 // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit() 834 int ntv = getNumericTypeValue(getProperty(c)); 835 836 if(ntv==NTV_NONE_) { 837 return getEuropeanDigit(c); 838 } else if(ntv<NTV_DIGIT_START_) { 839 /* decimal digit */ 840 return ntv-NTV_DECIMAL_START_; 841 } else if(ntv<NTV_NUMERIC_START_) { 842 /* other digit */ 843 return ntv-NTV_DIGIT_START_; 844 } else if(ntv<NTV_FRACTION_START_) { 845 /* small integer */ 846 return ntv-NTV_NUMERIC_START_; 847 } else if(ntv<NTV_LARGE_START_) { 848 /* fraction */ 849 return -2; 850 } else if(ntv<NTV_BASE60_START_) { 851 /* large, single-significant-digit integer */ 852 int mant=(ntv>>5)-14; 853 int exp=(ntv&0x1f)+2; 854 if(exp<9 || (exp==9 && mant<=2)) { 855 int numValue=mant; 856 do { 857 numValue*=10; 858 } while(--exp>0); 859 return numValue; 860 } else { 861 return -2; 862 } 863 } else if(ntv<NTV_FRACTION20_START_) { 864 /* sexagesimal (base 60) integer */ 865 int numValue=(ntv>>2)-0xbf; 866 int exp=(ntv&3)+1; 867 868 switch(exp) { 869 case 4: 870 
numValue*=60*60*60*60; 871 break; 872 case 3: 873 numValue*=60*60*60; 874 break; 875 case 2: 876 numValue*=60*60; 877 break; 878 case 1: 879 numValue*=60; 880 break; 881 case 0: 882 default: 883 break; 884 } 885 886 return numValue; 887 } else if(ntv<NTV_RESERVED_START_) { 888 // fraction-20 e.g. 3/80 889 return -2; 890 } else { 891 /* reserved */ 892 return -2; 893 } 894 } 895 896 public double getUnicodeNumericValue(int c) { 897 // equivalent to c version double u_getNumericValue(UChar32 c) 898 int ntv = getNumericTypeValue(getProperty(c)); 899 900 if(ntv==NTV_NONE_) { 901 return UCharacter.NO_NUMERIC_VALUE; 902 } else if(ntv<NTV_DIGIT_START_) { 903 /* decimal digit */ 904 return ntv-NTV_DECIMAL_START_; 905 } else if(ntv<NTV_NUMERIC_START_) { 906 /* other digit */ 907 return ntv-NTV_DIGIT_START_; 908 } else if(ntv<NTV_FRACTION_START_) { 909 /* small integer */ 910 return ntv-NTV_NUMERIC_START_; 911 } else if(ntv<NTV_LARGE_START_) { 912 /* fraction */ 913 int numerator=(ntv>>4)-12; 914 int denominator=(ntv&0xf)+1; 915 return (double)numerator/denominator; 916 } else if(ntv<NTV_BASE60_START_) { 917 /* large, single-significant-digit integer */ 918 double numValue; 919 int mant=(ntv>>5)-14; 920 int exp=(ntv&0x1f)+2; 921 numValue=mant; 922 923 /* multiply by 10^exp without math.h */ 924 while(exp>=4) { 925 numValue*=10000.; 926 exp-=4; 927 } 928 switch(exp) { 929 case 3: 930 numValue*=1000.; 931 break; 932 case 2: 933 numValue*=100.; 934 break; 935 case 1: 936 numValue*=10.; 937 break; 938 case 0: 939 default: 940 break; 941 } 942 943 return numValue; 944 } else if(ntv<NTV_FRACTION20_START_) { 945 /* sexagesimal (base 60) integer */ 946 int numValue=(ntv>>2)-0xbf; 947 int exp=(ntv&3)+1; 948 949 switch(exp) { 950 case 4: 951 numValue*=60*60*60*60; 952 break; 953 case 3: 954 numValue*=60*60*60; 955 break; 956 case 2: 957 numValue*=60*60; 958 break; 959 case 1: 960 numValue*=60; 961 break; 962 case 0: 963 default: 964 break; 965 } 966 967 return numValue; 968 } else 
if(ntv<NTV_RESERVED_START_) { 969 // fraction-20 e.g. 3/80 970 int frac20=ntv-NTV_FRACTION20_START_; // 0..0x17 971 int numerator=2*(frac20&3)+1; 972 int denominator=20<<(frac20>>2); 973 return (double)numerator/denominator; 974 } else { 975 /* reserved */ 976 return UCharacter.NO_NUMERIC_VALUE; 977 } 978 } 979 980 // protected variables ----------------------------------------------- 981 982 /** 983 * Extra property trie 984 */ 985 Trie2_16 m_additionalTrie_; 986 /** 987 * Extra property vectors, 1st column for age and second for binary 988 * properties. 989 */ 990 int m_additionalVectors_[]; 991 /** 992 * Number of additional columns 993 */ 994 int m_additionalColumnsCount_; 995 /** 996 * Maximum values for block, bits used as in vector word 997 * 0 998 */ 999 int m_maxBlockScriptValue_; 1000 /** 1001 * Maximum values for script, bits used as in vector word 1002 * 0 1003 */ 1004 int m_maxJTGValue_; 1005 1006 /** 1007 * Script_Extensions data 1008 */ 1009 public char[] m_scriptExtensions_; 1010 1011 // private variables ------------------------------------------------- 1012 1013 /** 1014 * Default name of the datafile 1015 */ 1016 private static final String DATA_FILE_NAME_ = "uprops.icu"; 1017 1018 // property data constants ------------------------------------------------- 1019 1020 /** 1021 * Numeric types and values in the main properties words. 1022 */ 1023 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; 1024 private static final int getNumericTypeValue(int props) { 1025 return props >> NUMERIC_TYPE_VALUE_SHIFT_; 1026 } 1027 /* constants for the storage form of numeric types and values */ 1028 /** No numeric value. 
*/ 1029 private static final int NTV_NONE_ = 0; 1030 /** Decimal digits: nv=0..9 */ 1031 private static final int NTV_DECIMAL_START_ = 1; 1032 /** Other digits: nv=0..9 */ 1033 private static final int NTV_DIGIT_START_ = 11; 1034 /** Small integers: nv=0..154 */ 1035 private static final int NTV_NUMERIC_START_ = 21; 1036 /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ 1037 private static final int NTV_FRACTION_START_ = 0xb0; 1038 /** 1039 * Large integers: 1040 * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) 1041 * (only one significant decimal digit) 1042 */ 1043 private static final int NTV_LARGE_START_ = 0x1e0; 1044 /** 1045 * Sexagesimal numbers: 1046 * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) 1047 */ 1048 private static final int NTV_BASE60_START_=0x300; 1049 /** 1050 * Fraction-20 values: 1051 * frac20 = ntv-0x324 = 0..0x17 -> 1|3|5|7 / 20|40|80|160|320|640 1052 * numerator: num = 2*(frac20&3)+1 1053 * denominator: den = 20<<(frac20>>2) 1054 */ 1055 private static final int NTV_FRACTION20_START_ = NTV_BASE60_START_ + 36; // 0x300+9*4=0x324 1056 /** No numeric value (yet). */ 1057 private static final int NTV_RESERVED_START_ = NTV_FRACTION20_START_ + 24; // 0x324+6*4=0x34c 1058 1059 private static final int ntvGetType(int ntv) { 1060 return 1061 (ntv==NTV_NONE_) ? NumericType.NONE : 1062 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : 1063 (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT : 1064 NumericType.NUMERIC; 1065 } 1066 1067 /* 1068 * Properties in vector word 0 1069 * Bits 1070 * 31..24 DerivedAge version major/minor one nibble each 1071 * 23..22 3..1: Bits 7..0 = Script_Extensions index 1072 * 3: Script value from Script_Extensions 1073 * 2: Script=Inherited 1074 * 1: Script=Common 1075 * 0: Script=bits 7..0 1076 * 21..20 reserved 1077 * 19..17 East Asian Width 1078 * 16.. 8 UBlockCode 1079 * 7.. 
0   UScriptCode
     */

    /**
     * Script_Extensions: mask includes Script
     * (bits 23..22 together with bits 7..0 of vector word 0).
     */
    public static final int SCRIPT_X_MASK = 0x00c000ff;
    //private static final int SCRIPT_X_SHIFT = 22;
    /**
     * Integer properties mask for East Asian cell width
     * (bits 19..17 of vector word 0).
     * Equivalent to icu4c UPROPS_EA_MASK
     */
    private static final int EAST_ASIAN_MASK_ = 0x000e0000;
    /**
     * Integer properties shift value for East Asian cell width.
     * Equivalent to icu4c UPROPS_EA_SHIFT
     */
    private static final int EAST_ASIAN_SHIFT_ = 17;
    /**
     * Integer properties mask for blocks
     * (bits 16..8 of vector word 0).
     * Equivalent to icu4c UPROPS_BLOCK_MASK
     */
    private static final int BLOCK_MASK_ = 0x0001ff00;
    /**
     * Integer properties shift value for blocks.
     * Equivalent to icu4c UPROPS_BLOCK_SHIFT
     */
    private static final int BLOCK_SHIFT_ = 8;
    /**
     * Integer properties mask for scripts
     * (bits 7..0 of vector word 0; no shift is needed).
     * Equivalent to icu4c UPROPS_SCRIPT_MASK
     */
    public static final int SCRIPT_MASK_ = 0x000000ff;

    /*
     * Values of bits 23..22 of vector word 0, already in position.
     * SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions.
     */
    public static final int SCRIPT_X_WITH_COMMON = 0x400000;
    public static final int SCRIPT_X_WITH_INHERITED = 0x800000;
    public static final int SCRIPT_X_WITH_OTHER = 0xc00000;

    /**
     * Additional properties used in internal trie data
     */
    /*
     * Properties in vector word 1
     * Each bit encodes one binary property.
     * The following constants represent the bit number, use 1<<UPROPS_XYZ.
     * UPROPS_BINARY_1_TOP<=32!
     *
     * Keep this list of property enums in sync with
     * propListNames[] in icu/source/tools/genprops/props2.c!
     *
     * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
*/
    // Bit numbers (not masks) within vector word 1; the values 0..31 fill
    // the whole 32-bit word, so no further bit can be added here.
    private static final int WHITE_SPACE_PROPERTY_ = 0;
    private static final int DASH_PROPERTY_ = 1;
    private static final int HYPHEN_PROPERTY_ = 2;
    private static final int QUOTATION_MARK_PROPERTY_ = 3;
    private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
    private static final int MATH_PROPERTY_ = 5;
    private static final int HEX_DIGIT_PROPERTY_ = 6;
    private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
    private static final int ALPHABETIC_PROPERTY_ = 8;
    private static final int IDEOGRAPHIC_PROPERTY_ = 9;
    private static final int DIACRITIC_PROPERTY_ = 10;
    private static final int EXTENDER_PROPERTY_ = 11;
    private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
    private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
    private static final int GRAPHEME_LINK_PROPERTY_ = 14;
    private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
    private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
    private static final int RADICAL_PROPERTY_ = 17;
    private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
    private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
    private static final int DEPRECATED_PROPERTY_ = 20;
    private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
    private static final int XID_START_PROPERTY_ = 22;
    private static final int XID_CONTINUE_PROPERTY_ = 23;
    private static final int ID_START_PROPERTY_ = 24;
    private static final int ID_CONTINUE_PROPERTY_ = 25;
    private static final int GRAPHEME_BASE_PROPERTY_ = 26;
    private static final int S_TERM_PROPERTY_ = 27;
    private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
    private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
    private static final int PATTERN_WHITE_SPACE = 30;
    private static final int PREPENDED_CONCATENATION_MARK = 31;  // new in ICU 60 and Unicode 10

    /*
     *
Properties in vector word 2
     * Bits
     * 31..27   http://www.unicode.org/reports/tr51/#Emoji_Properties
     *     26   reserved
     * 25..20   Line Break
     * 19..15   Sentence Break
     * 14..10   Word Break
     *  9.. 5   Grapheme Cluster Break
     *  4.. 0   Decomposition Type
     */
    // Bit numbers (not masks) of the emoji binary properties in vector word 2.
    private static final int PROPS_2_EMOJI_COMPONENT = 27;
    private static final int PROPS_2_EMOJI = 28;
    private static final int PROPS_2_EMOJI_PRESENTATION = 29;
    private static final int PROPS_2_EMOJI_MODIFIER = 30;
    private static final int PROPS_2_EMOJI_MODIFIER_BASE = 31;

    // Line Break: bits 25..20
    private static final int LB_MASK          = 0x03f00000;
    private static final int LB_SHIFT         = 20;

    // Sentence Break: bits 19..15
    private static final int SB_MASK          = 0x000f8000;
    private static final int SB_SHIFT         = 15;

    // Word Break: bits 14..10
    private static final int WB_MASK          = 0x00007c00;
    private static final int WB_SHIFT         = 10;

    // Grapheme Cluster Break: bits 9..5
    private static final int GCB_MASK         = 0x000003e0;
    private static final int GCB_SHIFT        = 5;

    /**
     * Integer properties mask for decomposition type.
     * Equivalent to icu4c UPROPS_DT_MASK.
1197 */ 1198 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f; 1199 1200 /** 1201 * First nibble shift 1202 */ 1203 private static final int FIRST_NIBBLE_SHIFT_ = 0x4; 1204 /** 1205 * Second nibble mask 1206 */ 1207 private static final int LAST_NIBBLE_MASK_ = 0xF; 1208 /** 1209 * Age value shift 1210 */ 1211 private static final int AGE_SHIFT_ = 24; 1212 1213 1214 // private constructors -------------------------------------------------- 1215 1216 /** 1217 * Constructor 1218 * @exception IOException thrown when data reading fails or data corrupted 1219 */ 1220 private UCharacterProperty() throws IOException 1221 { 1222 // consistency check 1223 if(binProps.length!=UProperty.BINARY_LIMIT) { 1224 throw new ICUException("binProps.length!=UProperty.BINARY_LIMIT"); 1225 } 1226 if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) { 1227 throw new ICUException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)"); 1228 } 1229 1230 // jar access 1231 ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_); 1232 m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable()); 1233 // Read or skip the 16 indexes. 
1234 int propertyOffset = bytes.getInt(); 1235 /* exceptionOffset = */ bytes.getInt(); 1236 /* caseOffset = */ bytes.getInt(); 1237 int additionalOffset = bytes.getInt(); 1238 int additionalVectorsOffset = bytes.getInt(); 1239 m_additionalColumnsCount_ = bytes.getInt(); 1240 int scriptExtensionsOffset = bytes.getInt(); 1241 int reservedOffset7 = bytes.getInt(); 1242 /* reservedOffset8 = */ bytes.getInt(); 1243 /* dataTopOffset = */ bytes.getInt(); 1244 m_maxBlockScriptValue_ = bytes.getInt(); 1245 m_maxJTGValue_ = bytes.getInt(); 1246 ICUBinary.skipBytes(bytes, (16 - 12) << 2); 1247 1248 // read the main properties trie 1249 m_trie_ = Trie2_16.createFromSerialized(bytes); 1250 int expectedTrieLength = (propertyOffset - 16) * 4; 1251 int trieLength = m_trie_.getSerializedLength(); 1252 if(trieLength > expectedTrieLength) { 1253 throw new IOException("uprops.icu: not enough bytes for main trie"); 1254 } 1255 // skip padding after trie bytes 1256 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1257 1258 // skip unused intervening data structures 1259 ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); 1260 1261 if(m_additionalColumnsCount_ > 0) { 1262 // reads the additional property block 1263 m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); 1264 expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; 1265 trieLength = m_additionalTrie_.getSerializedLength(); 1266 if(trieLength > expectedTrieLength) { 1267 throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); 1268 } 1269 // skip padding after trie bytes 1270 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1271 1272 // additional properties 1273 int size = scriptExtensionsOffset - additionalVectorsOffset; 1274 m_additionalVectors_ = ICUBinary.getInts(bytes, size, 0); 1275 } 1276 1277 // Script_Extensions 1278 int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; 1279 if(numChars > 0) { 1280 m_scriptExtensions_ = 
ICUBinary.getChars(bytes, numChars, 0); 1281 } 1282 } 1283 1284 private static final class IsAcceptable implements ICUBinary.Authenticate { 1285 // @Override when we switch to Java 6 1286 @Override 1287 public boolean isDataVersionAcceptable(byte version[]) { 1288 return version[0] == 7; 1289 } 1290 } 1291 private static final int DATA_FORMAT = 0x5550726F; // "UPro" 1292 1293 // private methods ------------------------------------------------------- 1294 1295 /* 1296 * Compare additional properties to see if it has argument type 1297 * @param property 32 bit properties 1298 * @param type character type 1299 * @return true if property has type 1300 */ 1301 /*private boolean compareAdditionalType(int property, int type) 1302 { 1303 return (property & (1 << type)) != 0; 1304 }*/ 1305 1306 // property starts for UnicodeSet -------------------------------------- *** 1307 1308 private static final int TAB = 0x0009; 1309 //private static final int LF = 0x000a; 1310 //private static final int FF = 0x000c; 1311 private static final int CR = 0x000d; 1312 private static final int U_A = 0x0041; 1313 private static final int U_F = 0x0046; 1314 private static final int U_Z = 0x005a; 1315 private static final int U_a = 0x0061; 1316 private static final int U_f = 0x0066; 1317 private static final int U_z = 0x007a; 1318 private static final int DEL = 0x007f; 1319 private static final int NL = 0x0085; 1320 private static final int NBSP = 0x00a0; 1321 private static final int CGJ = 0x034f; 1322 private static final int FIGURESP= 0x2007; 1323 private static final int HAIRSP = 0x200a; 1324 //private static final int ZWNJ = 0x200c; 1325 //private static final int ZWJ = 0x200d; 1326 private static final int RLM = 0x200f; 1327 private static final int NNBSP = 0x202f; 1328 private static final int WJ = 0x2060; 1329 private static final int INHSWAP = 0x206a; 1330 private static final int NOMDIG = 0x206f; 1331 private static final int U_FW_A = 0xff21; 1332 private static final int U_FW_F = 
0xff26; 1333 private static final int U_FW_Z = 0xff3a; 1334 private static final int U_FW_a = 0xff41; 1335 private static final int U_FW_f = 0xff46; 1336 private static final int U_FW_z = 0xff5a; 1337 private static final int ZWNBSP = 0xfeff; 1338 1339 public UnicodeSet addPropertyStarts(UnicodeSet set) { 1340 /* add the start code point of each same-value range of the main trie */ 1341 Iterator<Trie2.Range> trieIterator = m_trie_.iterator(); 1342 Trie2.Range range; 1343 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1344 set.add(range.startCodePoint); 1345 } 1346 1347 /* add code points with hardcoded properties, plus the ones following them */ 1348 1349 /* add for u_isblank() */ 1350 set.add(TAB); 1351 set.add(TAB+1); 1352 1353 /* add for IS_THAT_CONTROL_SPACE() */ 1354 set.add(CR+1); /* range TAB..CR */ 1355 set.add(0x1c); 1356 set.add(0x1f+1); 1357 set.add(NL); 1358 set.add(NL+1); 1359 1360 /* add for u_isIDIgnorable() what was not added above */ 1361 set.add(DEL); /* range DEL..NBSP-1, NBSP added below */ 1362 set.add(HAIRSP); 1363 set.add(RLM+1); 1364 set.add(INHSWAP); 1365 set.add(NOMDIG+1); 1366 set.add(ZWNBSP); 1367 set.add(ZWNBSP+1); 1368 1369 /* add no-break spaces for u_isWhitespace() what was not added above */ 1370 set.add(NBSP); 1371 set.add(NBSP+1); 1372 set.add(FIGURESP); 1373 set.add(FIGURESP+1); 1374 set.add(NNBSP); 1375 set.add(NNBSP+1); 1376 1377 /* add for u_charDigitValue() */ 1378 // TODO remove when UCharacter.getHanNumericValue() is changed to just return 1379 // Unicode numeric values 1380 set.add(0x3007); 1381 set.add(0x3008); 1382 set.add(0x4e00); 1383 set.add(0x4e01); 1384 set.add(0x4e8c); 1385 set.add(0x4e8d); 1386 set.add(0x4e09); 1387 set.add(0x4e0a); 1388 set.add(0x56db); 1389 set.add(0x56dc); 1390 set.add(0x4e94); 1391 set.add(0x4e95); 1392 set.add(0x516d); 1393 set.add(0x516e); 1394 set.add(0x4e03); 1395 set.add(0x4e04); 1396 set.add(0x516b); 1397 set.add(0x516c); 1398 set.add(0x4e5d); 1399 
set.add(0x4e5e); 1400 1401 /* add for u_digit() */ 1402 set.add(U_a); 1403 set.add(U_z+1); 1404 set.add(U_A); 1405 set.add(U_Z+1); 1406 set.add(U_FW_a); 1407 set.add(U_FW_z+1); 1408 set.add(U_FW_A); 1409 set.add(U_FW_Z+1); 1410 1411 /* add for u_isxdigit() */ 1412 set.add(U_f+1); 1413 set.add(U_F+1); 1414 set.add(U_FW_f+1); 1415 set.add(U_FW_F+1); 1416 1417 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ 1418 set.add(WJ); /* range WJ..NOMDIG */ 1419 set.add(0xfff0); 1420 set.add(0xfffb+1); 1421 set.add(0xe0000); 1422 set.add(0xe0fff+1); 1423 1424 /* add for UCHAR_GRAPHEME_BASE and others */ 1425 set.add(CGJ); 1426 set.add(CGJ+1); 1427 1428 return set; // for chaining 1429 } 1430 1431 public void upropsvec_addPropertyStarts(UnicodeSet set) { 1432 /* add the start code point of each same-value range of the properties vectors trie */ 1433 if(m_additionalColumnsCount_>0) { 1434 /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ 1435 Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator(); 1436 Trie2.Range range; 1437 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1438 set.add(range.startCodePoint); 1439 } 1440 } 1441 } 1442 1443 // This static initializer block must be placed after 1444 // other static member initialization 1445 static { 1446 try { 1447 INSTANCE = new UCharacterProperty(); 1448 } 1449 catch (IOException e) { 1450 throw new MissingResourceException(e.getMessage(),"",""); 1451 } 1452 } 1453 1454 /*---------------------------------------------------------------- 1455 * Inclusions list 1456 *----------------------------------------------------------------*/ 1457 1458 /* 1459 * Return a set of characters for property enumeration. 1460 * The set implicitly contains 0x110000 as well, which is one more than the highest 1461 * Unicode code point. 
1462 * 1463 * This set is used as an ordered list - its code points are ordered, and 1464 * consecutive code points (in Unicode code point order) in the set define a range. 1465 * For each two consecutive characters (start, limit) in the set, 1466 * all of the UCD/normalization and related properties for 1467 * all code points start..limit-1 are all the same, 1468 * except for character names and ISO comments. 1469 * 1470 * All Unicode code points U+0000..U+10ffff are covered by these ranges. 1471 * The ranges define a partition of the Unicode code space. 1472 * ICU uses the inclusions set to enumerate properties for generating 1473 * UnicodeSets containing all code points that have a certain property value. 1474 * 1475 * The Inclusion List is generated from the UCD. It is generated 1476 * by enumerating the data tries, and code points for hardcoded properties 1477 * are added as well. 1478 * 1479 * -------------------------------------------------------------------------- 1480 * 1481 * The following are ideas for getting properties-unique code point ranges, 1482 * with possible optimizations beyond the current implementation. 1483 * These optimizations would require more code and be more fragile. 1484 * The current implementation generates one single list (set) for all properties. 1485 * 1486 * To enumerate properties efficiently, one needs to know ranges of 1487 * repetitive values, so that the value of only each start code point 1488 * can be applied to the whole range. 1489 * This information is in principle available in the uprops.icu/unorm.icu data. 1490 * 1491 * There are two obstacles: 1492 * 1493 * 1. Some properties are computed from multiple data structures, 1494 * making it necessary to get repetitive ranges by intersecting 1495 * ranges from multiple tries. 1496 * 1497 * 2. It is not economical to write code for getting repetitive ranges 1498 * that are precise for each of some 50 properties. 
1499 * 1500 * Compromise ideas: 1501 * 1502 * - Get ranges per trie, not per individual property. 1503 * Each range contains the same values for a whole group of properties. 1504 * This would generate currently five range sets, two for uprops.icu tries 1505 * and three for unorm.icu tries. 1506 * 1507 * - Combine sets of ranges for multiple tries to get sufficient sets 1508 * for properties, e.g., the uprops.icu main and auxiliary tries 1509 * for all non-normalization properties. 1510 * 1511 * Ideas for representing ranges and combining them: 1512 * 1513 * - A UnicodeSet could hold just the start code points of ranges. 1514 * Multiple sets are easily combined by or-ing them together. 1515 * 1516 * - Alternatively, a UnicodeSet could hold each even-numbered range. 1517 * All ranges could be enumerated by using each start code point 1518 * (for the even-numbered ranges) as well as each limit (end+1) code point 1519 * (for the odd-numbered ranges). 1520 * It should be possible to combine two such sets by xor-ing them, 1521 * but no more than two. 1522 * 1523 * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays, 1524 * but the first one is certainly simpler and applicable for combining more than 1525 * two range sets. 1526 * 1527 * It is possible to combine all range sets for all uprops/unorm tries into one 1528 * set that can be used for all properties. 1529 * As an optimization, there could be less-combined range sets for certain 1530 * groups of properties. 1531 * The relationship of which less-combined range set to use for which property 1532 * depends on the implementation of the properties and must be hardcoded 1533 * - somewhat error-prone and higher maintenance but can be tested easily 1534 * by building property sets "the simple way" in test code. 1535 * 1536 * --- 1537 * 1538 * Do not use a UnicodeSet pattern because that causes infinite recursion; 1539 * UnicodeSet depends on the inclusions set. 
1540 * 1541 * --- 1542 * 1543 * getInclusions() is commented out starting 2005-feb-12 because 1544 * UnicodeSet now calls the uxyz_addPropertyStarts() directly, 1545 * and only for the relevant property source. 1546 */ 1547 /* 1548 public UnicodeSet getInclusions() { 1549 UnicodeSet set = new UnicodeSet(); 1550 NormalizerImpl.addPropertyStarts(set); 1551 addPropertyStarts(set); 1552 return set; 1553 } 1554 */ 1555 } 1556