1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.impl; 9 10 import java.io.IOException; 11 import java.nio.ByteBuffer; 12 import java.util.Iterator; 13 import java.util.MissingResourceException; 14 15 import com.ibm.icu.lang.UCharacter; 16 import com.ibm.icu.lang.UCharacter.HangulSyllableType; 17 import com.ibm.icu.lang.UCharacter.NumericType; 18 import com.ibm.icu.lang.UCharacterCategory; 19 import com.ibm.icu.lang.UProperty; 20 import com.ibm.icu.lang.UScript; 21 import com.ibm.icu.text.Normalizer2; 22 import com.ibm.icu.text.UTF16; 23 import com.ibm.icu.text.UnicodeSet; 24 import com.ibm.icu.util.ICUException; 25 import com.ibm.icu.util.VersionInfo; 26 27 /** 28 * <p>Internal class used for Unicode character property database.</p> 29 * <p>This classes store binary data read from uprops.icu. 30 * It does not have the capability to parse the data into more high-level 31 * information. It only returns bytes of information when required.</p> 32 * <p>Due to the form most commonly used for retrieval, array of char is used 33 * to store the binary data.</p> 34 * <p>UCharacterPropertyDB also contains information on accessing indexes to 35 * significant points in the binary data.</p> 36 * <p>Responsibility for molding the binary data into more meaning form lies on 37 * <a href=UCharacter.html>UCharacter</a>.</p> 38 * @author Syn Wee Quek 39 * @since release 2.1, february 1st 2002 40 */ 41 42 public final class UCharacterProperty 43 { 44 // public data members ----------------------------------------------- 45 46 /* 47 * public singleton instance 48 */ 49 public static final UCharacterProperty INSTANCE; 50 51 /** 52 * Trie data 53 */ 54 public Trie2_16 m_trie_; 55 /** 56 * Unicode version 57 */ 58 public VersionInfo m_unicodeVersion_; 59 /** 60 * Latin capital letter i with dot above 61 */ 62 public static final char LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_ = 0x130; 63 /** 64 * Latin small letter i with dot above 65 */ 66 public static final char LATIN_SMALL_LETTER_DOTLESS_I_ = 0x131; 67 /** 68 * Latin lowercase i 69 */ 70 public static final char LATIN_SMALL_LETTER_I_ = 0x69; 71 /** 72 * Character type mask 73 */ 74 public static final int TYPE_MASK = 0x1F; 75 76 // uprops.h enum UPropertySource --------------------------------------- *** 77 78 /** No source, not a supported property. */ 79 public static final int SRC_NONE=0; 80 /** From uchar.c/uprops.icu main trie */ 81 public static final int SRC_CHAR=1; 82 /** From uchar.c/uprops.icu properties vectors trie */ 83 public static final int SRC_PROPSVEC=2; 84 /** From unames.c/unames.icu */ 85 public static final int SRC_NAMES=3; 86 /** From ucase.c/ucase.icu */ 87 public static final int SRC_CASE=4; 88 /** From ubidi_props.c/ubidi.icu */ 89 public static final int SRC_BIDI=5; 90 /** From uchar.c/uprops.icu main trie as well as properties vectors trie */ 91 public static final int SRC_CHAR_AND_PROPSVEC=6; 92 /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */ 93 public static final int SRC_CASE_AND_NORM=7; 94 /** From normalizer2impl.cpp/nfc.nrm */ 95 public static final int SRC_NFC=8; 96 /** From normalizer2impl.cpp/nfkc.nrm */ 97 public static final int SRC_NFKC=9; 98 /** From normalizer2impl.cpp/nfkc_cf.nrm */ 99 public static final int SRC_NFKC_CF=10; 100 /** From normalizer2impl.cpp/nfc.nrm canonical iterator data */ 101 public static final int SRC_NFC_CANON_ITER=11; 102 /** One more than the highest UPropertySource (SRC_) constant. */ 103 public static final int SRC_COUNT=12; 104 105 // public methods ---------------------------------------------------- 106 107 /** 108 * Gets the main property value for code point ch. 109 * @param ch code point whose property value is to be retrieved 110 * @return property value of code point 111 */ 112 public final int getProperty(int ch) 113 { 114 return m_trie_.get(ch); 115 } 116 117 /** 118 * Gets the unicode additional properties. 119 * Java version of C u_getUnicodeProperties(). 120 * @param codepoint codepoint whose additional properties is to be 121 * retrieved 122 * @param column The column index. 123 * @return unicode properties 124 */ 125 public int getAdditional(int codepoint, int column) { 126 assert column >= 0; 127 if (column >= m_additionalColumnsCount_) { 128 return 0; 129 } 130 return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; 131 } 132 133 static final int MY_MASK = UCharacterProperty.TYPE_MASK 134 & ((1<<UCharacterCategory.UPPERCASE_LETTER) | 135 (1<<UCharacterCategory.LOWERCASE_LETTER) | 136 (1<<UCharacterCategory.TITLECASE_LETTER) | 137 (1<<UCharacterCategory.MODIFIER_LETTER) | 138 (1<<UCharacterCategory.OTHER_LETTER)); 139 140 141 /** 142 * <p>Get the "age" of the code point.</p> 143 * <p>The "age" is the Unicode version when the code point was first 144 * designated (as a non-character or for Private Use) or assigned a 145 * character.</p> 146 * <p>This can be useful to avoid emitting code points to receiving 147 * processes that do not accept newer characters.</p> 148 * <p>The data is from the UCD file DerivedAge.txt.</p> 149 * <p>This API does not check the validity of the codepoint.</p> 150 * @param codepoint The code point. 151 * @return the Unicode version number 152 */ 153 public VersionInfo getAge(int codepoint) 154 { 155 int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; 156 return VersionInfo.getInstance( 157 (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, 158 version & LAST_NIBBLE_MASK_, 0, 0); 159 } 160 161 private static final int GC_CN_MASK = getMask(UCharacter.UNASSIGNED); 162 private static final int GC_CC_MASK = getMask(UCharacter.CONTROL); 163 private static final int GC_CS_MASK = getMask(UCharacter.SURROGATE); 164 private static final int GC_ZS_MASK = getMask(UCharacter.SPACE_SEPARATOR); 165 private static final int GC_ZL_MASK = getMask(UCharacter.LINE_SEPARATOR); 166 private static final int GC_ZP_MASK = getMask(UCharacter.PARAGRAPH_SEPARATOR); 167 /** Mask constant for multiple UCharCategory bits (Z Separators). */ 168 private static final int GC_Z_MASK = GC_ZS_MASK|GC_ZL_MASK|GC_ZP_MASK; 169 170 /** 171 * Checks if c is in 172 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] 173 * with space=\p{Whitespace} and Control=Cc. 174 * Implements UCHAR_POSIX_GRAPH. 175 * @internal 176 */ 177 private static final boolean isgraphPOSIX(int c) { 178 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ 179 /* comparing ==0 returns FALSE for the categories mentioned */ 180 return (getMask(UCharacter.getType(c))& 181 (GC_CC_MASK|GC_CS_MASK|GC_CN_MASK|GC_Z_MASK)) 182 ==0; 183 } 184 185 // binary properties --------------------------------------------------- *** 186 187 private class BinaryProperty { 188 int column; // SRC_PROPSVEC column, or "source" if mask==0 189 int mask; 190 BinaryProperty(int column, int mask) { 191 this.column=column; 192 this.mask=mask; 193 } 194 BinaryProperty(int source) { 195 this.column=source; 196 this.mask=0; 197 } 198 final int getSource() { 199 return mask==0 ? column : SRC_PROPSVEC; 200 } 201 boolean contains(int c) { 202 // systematic, directly stored properties 203 return (getAdditional(c, column)&mask)!=0; 204 } 205 } 206 207 private class CaseBinaryProperty extends BinaryProperty { // case mapping properties 208 int which; 209 CaseBinaryProperty(int which) { 210 super(SRC_CASE); 211 this.which=which; 212 } 213 boolean contains(int c) { 214 return UCaseProps.INSTANCE.hasBinaryProperty(c, which); 215 } 216 } 217 218 private class NormInertBinaryProperty extends BinaryProperty { // UCHAR_NF*_INERT properties 219 int which; 220 NormInertBinaryProperty(int source, int which) { 221 super(source); 222 this.which=which; 223 } 224 boolean contains(int c) { 225 return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c); 226 } 227 } 228 229 BinaryProperty[] binProps={ 230 /* 231 * Binary-property implementations must be in order of corresponding UProperty, 232 * and there must be exactly one entry per binary UProperty. 233 */ 234 new BinaryProperty(1, (1<<ALPHABETIC_PROPERTY_)), 235 new BinaryProperty(1, (1<<ASCII_HEX_DIGIT_PROPERTY_)), 236 new BinaryProperty(SRC_BIDI) { // UCHAR_BIDI_CONTROL 237 boolean contains(int c) { 238 return UBiDiProps.INSTANCE.isBidiControl(c); 239 } 240 }, 241 new BinaryProperty(SRC_BIDI) { // UCHAR_BIDI_MIRRORED 242 boolean contains(int c) { 243 return UBiDiProps.INSTANCE.isMirrored(c); 244 } 245 }, 246 new BinaryProperty(1, (1<<DASH_PROPERTY_)), 247 new BinaryProperty(1, (1<<DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_)), 248 new BinaryProperty(1, (1<<DEPRECATED_PROPERTY_)), 249 new BinaryProperty(1, (1<<DIACRITIC_PROPERTY_)), 250 new BinaryProperty(1, (1<<EXTENDER_PROPERTY_)), 251 new BinaryProperty(SRC_NFC) { // UCHAR_FULL_COMPOSITION_EXCLUSION 252 boolean contains(int c) { 253 // By definition, Full_Composition_Exclusion is the same as NFC_QC=No. 254 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl; 255 return impl.isCompNo(impl.getNorm16(c)); 256 } 257 }, 258 new BinaryProperty(1, (1<<GRAPHEME_BASE_PROPERTY_)), 259 new BinaryProperty(1, (1<<GRAPHEME_EXTEND_PROPERTY_)), 260 new BinaryProperty(1, (1<<GRAPHEME_LINK_PROPERTY_)), 261 new BinaryProperty(1, (1<<HEX_DIGIT_PROPERTY_)), 262 new BinaryProperty(1, (1<<HYPHEN_PROPERTY_)), 263 new BinaryProperty(1, (1<<ID_CONTINUE_PROPERTY_)), 264 new BinaryProperty(1, (1<<ID_START_PROPERTY_)), 265 new BinaryProperty(1, (1<<IDEOGRAPHIC_PROPERTY_)), 266 new BinaryProperty(1, (1<<IDS_BINARY_OPERATOR_PROPERTY_)), 267 new BinaryProperty(1, (1<<IDS_TRINARY_OPERATOR_PROPERTY_)), 268 new BinaryProperty(SRC_BIDI) { // UCHAR_JOIN_CONTROL 269 boolean contains(int c) { 270 return UBiDiProps.INSTANCE.isJoinControl(c); 271 } 272 }, 273 new BinaryProperty(1, (1<<LOGICAL_ORDER_EXCEPTION_PROPERTY_)), 274 new CaseBinaryProperty(UProperty.LOWERCASE), 275 new BinaryProperty(1, (1<<MATH_PROPERTY_)), 276 new BinaryProperty(1, (1<<NONCHARACTER_CODE_POINT_PROPERTY_)), 277 new BinaryProperty(1, (1<<QUOTATION_MARK_PROPERTY_)), 278 new BinaryProperty(1, (1<<RADICAL_PROPERTY_)), 279 new CaseBinaryProperty(UProperty.SOFT_DOTTED), 280 new BinaryProperty(1, (1<<TERMINAL_PUNCTUATION_PROPERTY_)), 281 new BinaryProperty(1, (1<<UNIFIED_IDEOGRAPH_PROPERTY_)), 282 new CaseBinaryProperty(UProperty.UPPERCASE), 283 new BinaryProperty(1, (1<<WHITE_SPACE_PROPERTY_)), 284 new BinaryProperty(1, (1<<XID_CONTINUE_PROPERTY_)), 285 new BinaryProperty(1, (1<<XID_START_PROPERTY_)), 286 new CaseBinaryProperty(UProperty.CASE_SENSITIVE), 287 new BinaryProperty(1, (1<<S_TERM_PROPERTY_)), 288 new BinaryProperty(1, (1<<VARIATION_SELECTOR_PROPERTY_)), 289 new NormInertBinaryProperty(SRC_NFC, UProperty.NFD_INERT), 290 new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKD_INERT), 291 new NormInertBinaryProperty(SRC_NFC, UProperty.NFC_INERT), 292 new NormInertBinaryProperty(SRC_NFKC, UProperty.NFKC_INERT), 293 new BinaryProperty(SRC_NFC_CANON_ITER) { // UCHAR_SEGMENT_STARTER 294 boolean contains(int c) { 295 return Norm2AllModes.getNFCInstance().impl. 296 ensureCanonIterData().isCanonSegmentStarter(c); 297 } 298 }, 299 new BinaryProperty(1, (1<<PATTERN_SYNTAX)), 300 new BinaryProperty(1, (1<<PATTERN_WHITE_SPACE)), 301 new BinaryProperty(SRC_CHAR_AND_PROPSVEC) { // UCHAR_POSIX_ALNUM 302 boolean contains(int c) { 303 return UCharacter.isUAlphabetic(c) || UCharacter.isDigit(c); 304 } 305 }, 306 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_BLANK 307 boolean contains(int c) { 308 // "horizontal space" 309 if(c<=0x9f) { 310 return c==9 || c==0x20; /* TAB or SPACE */ 311 } else { 312 /* Zs */ 313 return UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR; 314 } 315 } 316 }, 317 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_GRAPH 318 boolean contains(int c) { 319 return isgraphPOSIX(c); 320 } 321 }, 322 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_PRINT 323 boolean contains(int c) { 324 /* 325 * Checks if codepoint is in \p{graph}\p{blank} - \p{cntrl}. 326 * 327 * The only cntrl character in graph+blank is TAB (in blank). 328 * Here we implement (blank-TAB)=Zs instead of calling u_isblank(). 329 */ 330 return (UCharacter.getType(c)==UCharacter.SPACE_SEPARATOR) || isgraphPOSIX(c); 331 } 332 }, 333 new BinaryProperty(SRC_CHAR) { // UCHAR_POSIX_XDIGIT 334 boolean contains(int c) { 335 /* check ASCII and Fullwidth ASCII a-fA-F */ 336 if( 337 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) || 338 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41)) 339 ) { 340 return true; 341 } 342 return UCharacter.getType(c)==UCharacter.DECIMAL_DIGIT_NUMBER; 343 } 344 }, 345 new CaseBinaryProperty(UProperty.CASED), 346 new CaseBinaryProperty(UProperty.CASE_IGNORABLE), 347 new CaseBinaryProperty(UProperty.CHANGES_WHEN_LOWERCASED), 348 new CaseBinaryProperty(UProperty.CHANGES_WHEN_UPPERCASED), 349 new CaseBinaryProperty(UProperty.CHANGES_WHEN_TITLECASED), 350 new BinaryProperty(SRC_CASE_AND_NORM) { // UCHAR_CHANGES_WHEN_CASEFOLDED 351 boolean contains(int c) { 352 String nfd=Norm2AllModes.getNFCInstance().impl.getDecomposition(c); 353 if(nfd!=null) { 354 /* c has a decomposition */ 355 c=nfd.codePointAt(0); 356 if(Character.charCount(c)!=nfd.length()) { 357 /* multiple code points */ 358 c=-1; 359 } 360 } else if(c<0) { 361 return false; /* protect against bad input */ 362 } 363 if(c>=0) { 364 /* single code point */ 365 UCaseProps csp=UCaseProps.INSTANCE; 366 UCaseProps.dummyStringBuilder.setLength(0); 367 return csp.toFullFolding(c, UCaseProps.dummyStringBuilder, 368 UCharacter.FOLD_CASE_DEFAULT)>=0; 369 } else { 370 String folded=UCharacter.foldCase(nfd, true); 371 return !folded.equals(nfd); 372 } 373 } 374 }, 375 new CaseBinaryProperty(UProperty.CHANGES_WHEN_CASEMAPPED), 376 new BinaryProperty(SRC_NFKC_CF) { // UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED 377 boolean contains(int c) { 378 Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstance().impl; 379 String src=UTF16.valueOf(c); 380 StringBuilder dest=new StringBuilder(); 381 // Small destCapacity for NFKC_CF(c). 382 Normalizer2Impl.ReorderingBuffer buffer=new Normalizer2Impl.ReorderingBuffer(kcf, dest, 5); 383 kcf.compose(src, 0, src.length(), false, true, buffer); 384 return !Normalizer2Impl.UTF16Plus.equal(dest, src); 385 } 386 }, 387 }; 388 389 public boolean hasBinaryProperty(int c, int which) { 390 if(which<UProperty.BINARY_START || UProperty.BINARY_LIMIT<=which) { 391 // not a known binary property 392 return false; 393 } else { 394 return binProps[which].contains(c); 395 } 396 } 397 398 // int-value and enumerated properties --------------------------------- *** 399 400 public int getType(int c) { 401 return getProperty(c)&TYPE_MASK; 402 } 403 404 /* 405 * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. 406 * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. 407 */ 408 private static final int /* UHangulSyllableType */ gcbToHst[]={ 409 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ 410 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ 411 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ 412 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ 413 HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ 414 HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ 415 HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ 416 HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ 417 HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ 418 HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ 419 /* 420 * Omit GCB values beyond what we need for hst. 421 * The code below checks for the array length. 422 */ 423 }; 424 425 private class IntProperty { 426 int column; // SRC_PROPSVEC column, or "source" if mask==0 427 int mask; 428 int shift; 429 IntProperty(int column, int mask, int shift) { 430 this.column=column; 431 this.mask=mask; 432 this.shift=shift; 433 } 434 IntProperty(int source) { 435 this.column=source; 436 this.mask=0; 437 } 438 final int getSource() { 439 return mask==0 ? column : SRC_PROPSVEC; 440 } 441 int getValue(int c) { 442 // systematic, directly stored properties 443 return (getAdditional(c, column)&mask)>>>shift; 444 } 445 int getMaxValue(int which) { 446 return (getMaxValues(column)&mask)>>>shift; 447 } 448 } 449 450 private class BiDiIntProperty extends IntProperty { 451 BiDiIntProperty() { 452 super(SRC_BIDI); 453 } 454 int getMaxValue(int which) { 455 return UBiDiProps.INSTANCE.getMaxValue(which); 456 } 457 } 458 459 private class CombiningClassIntProperty extends IntProperty { 460 CombiningClassIntProperty(int source) { 461 super(source); 462 } 463 int getMaxValue(int which) { 464 return 0xff; 465 } 466 } 467 468 private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties 469 int which; 470 int max; 471 NormQuickCheckIntProperty(int source, int which, int max) { 472 super(source); 473 this.which=which; 474 this.max=max; 475 } 476 int getValue(int c) { 477 return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_QUICK_CHECK).getQuickCheck(c); 478 } 479 int getMaxValue(int which) { 480 return max; 481 } 482 } 483 484 IntProperty intProps[]={ 485 new BiDiIntProperty() { // BIDI_CLASS 486 int getValue(int c) { 487 return UBiDiProps.INSTANCE.getClass(c); 488 } 489 }, 490 new IntProperty(0, BLOCK_MASK_, BLOCK_SHIFT_), 491 new CombiningClassIntProperty(SRC_NFC) { // CANONICAL_COMBINING_CLASS 492 int getValue(int c) { 493 return Normalizer2.getNFDInstance().getCombiningClass(c); 494 } 495 }, 496 new IntProperty(2, DECOMPOSITION_TYPE_MASK_, 0), 497 new IntProperty(0, EAST_ASIAN_MASK_, EAST_ASIAN_SHIFT_), 498 new IntProperty(SRC_CHAR) { // GENERAL_CATEGORY 499 int getValue(int c) { 500 return getType(c); 501 } 502 int getMaxValue(int which) { 503 return UCharacterCategory.CHAR_CATEGORY_COUNT-1; 504 } 505 }, 506 new BiDiIntProperty() { // JOINING_GROUP 507 int getValue(int c) { 508 return UBiDiProps.INSTANCE.getJoiningGroup(c); 509 } 510 }, 511 new BiDiIntProperty() { // JOINING_TYPE 512 int getValue(int c) { 513 return UBiDiProps.INSTANCE.getJoiningType(c); 514 } 515 }, 516 new IntProperty(2, LB_MASK, LB_SHIFT), // LINE_BREAK 517 new IntProperty(SRC_CHAR) { // NUMERIC_TYPE 518 int getValue(int c) { 519 return ntvGetType(getNumericTypeValue(getProperty(c))); 520 } 521 int getMaxValue(int which) { 522 return NumericType.COUNT-1; 523 } 524 }, 525 new IntProperty(0, SCRIPT_MASK_, 0) { 526 int getValue(int c) { 527 return UScript.getScript(c); 528 } 529 }, 530 new IntProperty(SRC_PROPSVEC) { // HANGUL_SYLLABLE_TYPE 531 int getValue(int c) { 532 /* see comments on gcbToHst[] above */ 533 int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT; 534 if(gcb<gcbToHst.length) { 535 return gcbToHst[gcb]; 536 } else { 537 return HangulSyllableType.NOT_APPLICABLE; 538 } 539 } 540 int getMaxValue(int which) { 541 return HangulSyllableType.COUNT-1; 542 } 543 }, 544 // max=1=YES -- these are never "maybe", only "no" or "yes" 545 new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFD_QUICK_CHECK, 1), 546 new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKD_QUICK_CHECK, 1), 547 // max=2=MAYBE 548 new NormQuickCheckIntProperty(SRC_NFC, UProperty.NFC_QUICK_CHECK, 2), 549 new NormQuickCheckIntProperty(SRC_NFKC, UProperty.NFKC_QUICK_CHECK, 2), 550 new CombiningClassIntProperty(SRC_NFC) { // LEAD_CANONICAL_COMBINING_CLASS 551 int getValue(int c) { 552 return Norm2AllModes.getNFCInstance().impl.getFCD16(c)>>8; 553 } 554 }, 555 new CombiningClassIntProperty(SRC_NFC) { // TRAIL_CANONICAL_COMBINING_CLASS 556 int getValue(int c) { 557 return Norm2AllModes.getNFCInstance().impl.getFCD16(c)&0xff; 558 } 559 }, 560 new IntProperty(2, GCB_MASK, GCB_SHIFT), // GRAPHEME_CLUSTER_BREAK 561 new IntProperty(2, SB_MASK, SB_SHIFT), // SENTENCE_BREAK 562 new IntProperty(2, WB_MASK, WB_SHIFT), // WORD_BREAK 563 new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE 564 int getValue(int c) { 565 return UBiDiProps.INSTANCE.getPairedBracketType(c); 566 } 567 }, 568 }; 569 570 public int getIntPropertyValue(int c, int which) { 571 if(which<UProperty.INT_START) { 572 if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) { 573 return binProps[which].contains(c) ? 1 : 0; 574 } 575 } else if(which<UProperty.INT_LIMIT) { 576 return intProps[which-UProperty.INT_START].getValue(c); 577 } else if (which == UProperty.GENERAL_CATEGORY_MASK) { 578 return getMask(getType(c)); 579 } 580 return 0; // undefined 581 } 582 583 public int getIntPropertyMaxValue(int which) { 584 if(which<UProperty.INT_START) { 585 if(UProperty.BINARY_START<=which && which<UProperty.BINARY_LIMIT) { 586 return 1; // maximum TRUE for all binary properties 587 } 588 } else if(which<UProperty.INT_LIMIT) { 589 return intProps[which-UProperty.INT_START].getMaxValue(which); 590 } 591 return -1; // undefined 592 } 593 594 public final int getSource(int which) { 595 if(which<UProperty.BINARY_START) { 596 return SRC_NONE; /* undefined */ 597 } else if(which<UProperty.BINARY_LIMIT) { 598 return binProps[which].getSource(); 599 } else if(which<UProperty.INT_START) { 600 return SRC_NONE; /* undefined */ 601 } else if(which<UProperty.INT_LIMIT) { 602 return intProps[which-UProperty.INT_START].getSource(); 603 } else if(which<UProperty.STRING_START) { 604 switch(which) { 605 case UProperty.GENERAL_CATEGORY_MASK: 606 case UProperty.NUMERIC_VALUE: 607 return SRC_CHAR; 608 609 default: 610 return SRC_NONE; 611 } 612 } else if(which<UProperty.STRING_LIMIT) { 613 switch(which) { 614 case UProperty.AGE: 615 return SRC_PROPSVEC; 616 617 case UProperty.BIDI_MIRRORING_GLYPH: 618 return SRC_BIDI; 619 620 case UProperty.CASE_FOLDING: 621 case UProperty.LOWERCASE_MAPPING: 622 case UProperty.SIMPLE_CASE_FOLDING: 623 case UProperty.SIMPLE_LOWERCASE_MAPPING: 624 case UProperty.SIMPLE_TITLECASE_MAPPING: 625 case UProperty.SIMPLE_UPPERCASE_MAPPING: 626 case UProperty.TITLECASE_MAPPING: 627 case UProperty.UPPERCASE_MAPPING: 628 return SRC_CASE; 629 630 case UProperty.ISO_COMMENT: 631 case UProperty.NAME: 632 case UProperty.UNICODE_1_NAME: 633 return SRC_NAMES; 634 635 default: 636 return SRC_NONE; 637 } 638 } else { 639 switch(which) { 640 case UProperty.SCRIPT_EXTENSIONS: 641 return SRC_PROPSVEC; 642 default: 643 return SRC_NONE; /* undefined */ 644 } 645 } 646 } 647 648 /** 649 * Forms a supplementary code point from the argument character<br> 650 * Note this is for internal use hence no checks for the validity of the 651 * surrogate characters are done 652 * @param lead lead surrogate character 653 * @param trail trailing surrogate character 654 * @return code point of the supplementary character 655 */ 656 public static int getRawSupplementary(char lead, char trail) 657 { 658 return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; 659 } 660 661 /** 662 * <p> 663 * Unicode property names and property value names are compared 664 * "loosely". Property[Value]Aliases.txt say: 665 * <quote> 666 * "With loose matching of property names, the case distinctions, 667 * whitespace, and '_' are ignored." 668 * </quote> 669 * </p> 670 * <p> 671 * This function does just that, for ASCII (char *) name strings. 672 * It is almost identical to ucnv_compareNames() but also ignores 673 * ASCII White_Space characters (U+0009..U+000d). 674 * </p> 675 * @param name1 name to compare 676 * @param name2 name to compare 677 * @return 0 if names are equal, < 0 if name1 is less than name2 and > 0 678 * if name1 is greater than name2. 679 */ 680 /* to be implemented in 2.4 681 * public static int comparePropertyNames(String name1, String name2) 682 { 683 int result = 0; 684 int i1 = 0; 685 int i2 = 0; 686 while (true) { 687 char ch1 = 0; 688 char ch2 = 0; 689 // Ignore delimiters '-', '_', and ASCII White_Space 690 if (i1 < name1.length()) { 691 ch1 = name1.charAt(i1 ++); 692 } 693 while (ch1 == '-' || ch1 == '_' || ch1 == ' ' || ch1 == '\t' 694 || ch1 == '\n' // synwee what is || ch1 == '\v' 695 || ch1 == '\f' || ch1=='\r') { 696 if (i1 < name1.length()) { 697 ch1 = name1.charAt(i1 ++); 698 } 699 else { 700 ch1 = 0; 701 } 702 } 703 if (i2 < name2.length()) { 704 ch2 = name2.charAt(i2 ++); 705 } 706 while (ch2 == '-' || ch2 == '_' || ch2 == ' ' || ch2 == '\t' 707 || ch2 == '\n' // synwee what is || ch1 == '\v' 708 || ch2 == '\f' || ch2=='\r') { 709 if (i2 < name2.length()) { 710 ch2 = name2.charAt(i2 ++); 711 } 712 else { 713 ch2 = 0; 714 } 715 } 716 717 // If we reach the ends of both strings then they match 718 if (ch1 == 0 && ch2 == 0) { 719 return 0; 720 } 721 722 // Case-insensitive comparison 723 if (ch1 != ch2) { 724 result = Character.toLowerCase(ch1) 725 - Character.toLowerCase(ch2); 726 if (result != 0) { 727 return result; 728 } 729 } 730 } 731 } 732 */ 733 734 /** 735 * Get the the maximum values for some enum/int properties. 736 * @return maximum values for the integer properties. 737 */ 738 public int getMaxValues(int column) 739 { 740 // return m_maxBlockScriptValue_; 741 742 switch(column) { 743 case 0: 744 return m_maxBlockScriptValue_; 745 case 2: 746 return m_maxJTGValue_; 747 default: 748 return 0; 749 } 750 } 751 752 /** 753 * Gets the type mask 754 * @param type character type 755 * @return mask 756 */ 757 public static final int getMask(int type) 758 { 759 return 1 << type; 760 } 761 762 763 /** 764 * Returns the digit values of characters like 'A' - 'Z', normal, 765 * half-width and full-width. This method assumes that the other digit 766 * characters are checked by the calling method. 767 * @param ch character to test 768 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise 769 * its corresponding digit will be returned. 770 */ 771 public static int getEuropeanDigit(int ch) { 772 if ((ch > 0x7a && ch < 0xff21) 773 || ch < 0x41 || (ch > 0x5a && ch < 0x61) 774 || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { 775 return -1; 776 } 777 if (ch <= 0x7a) { 778 // ch >= 0x41 or ch < 0x61 779 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); 780 } 781 // ch >= 0xff21 782 if (ch <= 0xff3a) { 783 return ch + 10 - 0xff21; 784 } 785 // ch >= 0xff41 && ch <= 0xff5a 786 return ch + 10 - 0xff41; 787 } 788 789 public int digit(int c) { 790 int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; 791 if(value<=9) { 792 return value; 793 } else { 794 return -1; 795 } 796 } 797 798 public int getNumericValue(int c) { 799 // slightly pruned version of getUnicodeNumericValue(), plus getEuropeanDigit() 800 int ntv = getNumericTypeValue(getProperty(c)); 801 802 if(ntv==NTV_NONE_) { 803 return getEuropeanDigit(c); 804 } else if(ntv<NTV_DIGIT_START_) { 805 /* decimal digit */ 806 return ntv-NTV_DECIMAL_START_; 807 } else if(ntv<NTV_NUMERIC_START_) { 808 /* other digit */ 809 return ntv-NTV_DIGIT_START_; 810 } else if(ntv<NTV_FRACTION_START_) { 811 /* small integer */ 812 return ntv-NTV_NUMERIC_START_; 813 } else if(ntv<NTV_LARGE_START_) { 814 /* fraction */ 815 return -2; 816 } else if(ntv<NTV_BASE60_START_) { 817 /* large, single-significant-digit integer */ 818 int mant=(ntv>>5)-14; 819 int exp=(ntv&0x1f)+2; 820 if(exp<9 || (exp==9 && mant<=2)) { 821 int numValue=mant; 822 do { 823 numValue*=10; 824 } while(--exp>0); 825 return numValue; 826 } else { 827 return -2; 828 } 829 } else if(ntv<NTV_RESERVED_START_) { 830 /* sexagesimal (base 60) integer */ 831 int numValue=(ntv>>2)-0xbf; 832 int exp=(ntv&3)+1; 833 834 switch(exp) { 835 case 4: 836 numValue*=60*60*60*60; 837 break; 838 case 3: 839 numValue*=60*60*60; 840 break; 841 case 2: 842 numValue*=60*60; 843 break; 844 case 1: 845 numValue*=60; 846 break; 847 case 0: 848 default: 849 break; 850 } 851 852 return numValue; 853 } else { 854 /* reserved */ 855 return -2; 856 } 857 } 858 859 public double getUnicodeNumericValue(int c) { 860 // equivalent to c version double u_getNumericValue(UChar32 c) 861 int ntv = getNumericTypeValue(getProperty(c)); 862 863 if(ntv==NTV_NONE_) { 864 return UCharacter.NO_NUMERIC_VALUE; 865 } else if(ntv<NTV_DIGIT_START_) { 866 /* decimal digit */ 867 return ntv-NTV_DECIMAL_START_; 868 } else if(ntv<NTV_NUMERIC_START_) { 869 /* other digit */ 870 return ntv-NTV_DIGIT_START_; 871 } else if(ntv<NTV_FRACTION_START_) { 872 /* small integer */ 873 return ntv-NTV_NUMERIC_START_; 874 } else if(ntv<NTV_LARGE_START_) { 875 /* fraction */ 876 int numerator=(ntv>>4)-12; 877 int denominator=(ntv&0xf)+1; 878 return (double)numerator/denominator; 879 } else if(ntv<NTV_BASE60_START_) { 880 /* large, single-significant-digit integer */ 881 double numValue; 882 int mant=(ntv>>5)-14; 883 int exp=(ntv&0x1f)+2; 884 numValue=mant; 885 886 /* multiply by 10^exp without math.h */ 887 while(exp>=4) { 888 numValue*=10000.; 889 exp-=4; 890 } 891 switch(exp) { 892 case 3: 893 numValue*=1000.; 894 break; 895 case 2: 896 numValue*=100.; 897 break; 898 case 1: 899 numValue*=10.; 900 break; 901 case 0: 902 default: 903 break; 904 } 905 906 return numValue; 907 } else if(ntv<NTV_RESERVED_START_) { 908 /* sexagesimal (base 60) integer */ 909 int numValue=(ntv>>2)-0xbf; 910 int exp=(ntv&3)+1; 911 912 switch(exp) { 913 case 4: 914 numValue*=60*60*60*60; 915 break; 916 case 3: 917 numValue*=60*60*60; 918 break; 919 case 2: 920 numValue*=60*60; 921 break; 922 case 1: 923 numValue*=60; 924 break; 925 case 0: 926 default: 927 break; 928 } 929 930 return numValue; 931 } else { 932 /* reserved */ 933 return UCharacter.NO_NUMERIC_VALUE; 934 } 935 } 936 937 // protected variables ----------------------------------------------- 938 939 /** 940 * Extra property trie 941 */ 942 Trie2_16 m_additionalTrie_; 943 /** 944 * Extra property vectors, 1st column for age and second for binary 945 * properties. 946 */ 947 int m_additionalVectors_[]; 948 /** 949 * Number of additional columns 950 */ 951 int m_additionalColumnsCount_; 952 /** 953 * Maximum values for block, bits used as in vector word 954 * 0 955 */ 956 int m_maxBlockScriptValue_; 957 /** 958 * Maximum values for script, bits used as in vector word 959 * 0 960 */ 961 int m_maxJTGValue_; 962 963 /** 964 * Script_Extensions data 965 */ 966 public char[] m_scriptExtensions_; 967 968 // private variables ------------------------------------------------- 969 970 /** 971 * Default name of the datafile 972 */ 973 private static final String DATA_FILE_NAME_ = "uprops.icu"; 974 975 /** 976 * Shift value for lead surrogate to form a supplementary character. 977 */ 978 private static final int LEAD_SURROGATE_SHIFT_ = 10; 979 /** 980 * Offset to add to combined surrogate pair to avoid masking. 981 */ 982 private static final int SURROGATE_OFFSET_ = 983 UTF16.SUPPLEMENTARY_MIN_VALUE - 984 (UTF16.SURROGATE_MIN_VALUE << 985 LEAD_SURROGATE_SHIFT_) - 986 UTF16.TRAIL_SURROGATE_MIN_VALUE; 987 988 989 // property data constants ------------------------------------------------- 990 991 /** 992 * Numeric types and values in the main properties words. 993 */ 994 private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; 995 private static final int getNumericTypeValue(int props) { 996 return props >> NUMERIC_TYPE_VALUE_SHIFT_; 997 } 998 /* constants for the storage form of numeric types and values */ 999 /** No numeric value. */ 1000 private static final int NTV_NONE_ = 0; 1001 /** Decimal digits: nv=0..9 */ 1002 private static final int NTV_DECIMAL_START_ = 1; 1003 /** Other digits: nv=0..9 */ 1004 private static final int NTV_DIGIT_START_ = 11; 1005 /** Small integers: nv=0..154 */ 1006 private static final int NTV_NUMERIC_START_ = 21; 1007 /** Fractions: ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16 */ 1008 private static final int NTV_FRACTION_START_ = 0xb0; 1009 /** 1010 * Large integers: 1011 * ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33) 1012 * (only one significant decimal digit) 1013 */ 1014 private static final int NTV_LARGE_START_ = 0x1e0; 1015 /** 1016 * Sexagesimal numbers: 1017 * ((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4) 1018 */ 1019 private static final int NTV_BASE60_START_=0x300; 1020 /** No numeric value (yet). */ 1021 private static final int NTV_RESERVED_START_ = NTV_BASE60_START_ + 36; // 0x300+9*4=0x324 1022 1023 private static final int ntvGetType(int ntv) { 1024 return 1025 (ntv==NTV_NONE_) ? NumericType.NONE : 1026 (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : 1027 (ntv<NTV_NUMERIC_START_) ? NumericType.DIGIT : 1028 NumericType.NUMERIC; 1029 } 1030 1031 /* 1032 * Properties in vector word 0 1033 * Bits 1034 * 31..24 DerivedAge version major/minor one nibble each 1035 * 23..22 3..1: Bits 7..0 = Script_Extensions index 1036 * 3: Script value from Script_Extensions 1037 * 2: Script=Inherited 1038 * 1: Script=Common 1039 * 0: Script=bits 7..0 1040 * 21..20 reserved 1041 * 19..17 East Asian Width 1042 * 16.. 8 UBlockCode 1043 * 7.. 0 UScriptCode 1044 */ 1045 1046 /** 1047 * Script_Extensions: mask includes Script 1048 */ 1049 public static final int SCRIPT_X_MASK = 0x00c000ff; 1050 //private static final int SCRIPT_X_SHIFT = 22; 1051 /** 1052 * Integer properties mask and shift values for East Asian cell width. 1053 * Equivalent to icu4c UPROPS_EA_MASK 1054 */ 1055 private static final int EAST_ASIAN_MASK_ = 0x000e0000; 1056 /** 1057 * Integer properties mask and shift values for East Asian cell width. 1058 * Equivalent to icu4c UPROPS_EA_SHIFT 1059 */ 1060 private static final int EAST_ASIAN_SHIFT_ = 17; 1061 /** 1062 * Integer properties mask and shift values for blocks. 1063 * Equivalent to icu4c UPROPS_BLOCK_MASK 1064 */ 1065 private static final int BLOCK_MASK_ = 0x0001ff00; 1066 /** 1067 * Integer properties mask and shift values for blocks. 1068 * Equivalent to icu4c UPROPS_BLOCK_SHIFT 1069 */ 1070 private static final int BLOCK_SHIFT_ = 8; 1071 /** 1072 * Integer properties mask and shift values for scripts. 1073 * Equivalent to icu4c UPROPS_SHIFT_MASK 1074 */ 1075 public static final int SCRIPT_MASK_ = 0x000000ff; 1076 1077 /* SCRIPT_X_WITH_COMMON must be the lowest value that involves Script_Extensions. */ 1078 public static final int SCRIPT_X_WITH_COMMON = 0x400000; 1079 public static final int SCRIPT_X_WITH_INHERITED = 0x800000; 1080 public static final int SCRIPT_X_WITH_OTHER = 0xc00000; 1081 1082 /** 1083 * Additional properties used in internal trie data 1084 */ 1085 /* 1086 * Properties in vector word 1 1087 * Each bit encodes one binary property. 1088 * The following constants represent the bit number, use 1<<UPROPS_XYZ. 1089 * UPROPS_BINARY_1_TOP<=32! 1090 * 1091 * Keep this list of property enums in sync with 1092 * propListNames[] in icu/source/tools/genprops/props2.c! 1093 * 1094 * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 1095 */ 1096 private static final int WHITE_SPACE_PROPERTY_ = 0; 1097 private static final int DASH_PROPERTY_ = 1; 1098 private static final int HYPHEN_PROPERTY_ = 2; 1099 private static final int QUOTATION_MARK_PROPERTY_ = 3; 1100 private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4; 1101 private static final int MATH_PROPERTY_ = 5; 1102 private static final int HEX_DIGIT_PROPERTY_ = 6; 1103 private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7; 1104 private static final int ALPHABETIC_PROPERTY_ = 8; 1105 private static final int IDEOGRAPHIC_PROPERTY_ = 9; 1106 private static final int DIACRITIC_PROPERTY_ = 10; 1107 private static final int EXTENDER_PROPERTY_ = 11; 1108 private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12; 1109 private static final int GRAPHEME_EXTEND_PROPERTY_ = 13; 1110 private static final int GRAPHEME_LINK_PROPERTY_ = 14; 1111 private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15; 1112 private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16; 1113 private static final int RADICAL_PROPERTY_ = 17; 1114 private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18; 1115 private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19; 1116 private static final int DEPRECATED_PROPERTY_ = 20; 1117 private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21; 1118 private static final int XID_START_PROPERTY_ = 22; 1119 private static final int XID_CONTINUE_PROPERTY_ = 23; 1120 private static final int ID_START_PROPERTY_ = 24; 1121 private static final int ID_CONTINUE_PROPERTY_ = 25; 1122 private static final int GRAPHEME_BASE_PROPERTY_ = 26; 1123 private static final int S_TERM_PROPERTY_ = 27; 1124 private static final int VARIATION_SELECTOR_PROPERTY_ = 28; 1125 private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */ 1126 private static final int PATTERN_WHITE_SPACE = 30; 1127 1128 /* 1129 * Properties in vector word 2 1130 * Bits 1131 * 31..26 reserved 1132 * 25..20 Line Break 1133 * 19..15 Sentence Break 1134 * 14..10 Word Break 1135 * 9.. 5 Grapheme Cluster Break 1136 * 4.. 0 Decomposition Type 1137 */ 1138 private static final int LB_MASK = 0x03f00000; 1139 private static final int LB_SHIFT = 20; 1140 1141 private static final int SB_MASK = 0x000f8000; 1142 private static final int SB_SHIFT = 15; 1143 1144 private static final int WB_MASK = 0x00007c00; 1145 private static final int WB_SHIFT = 10; 1146 1147 private static final int GCB_MASK = 0x000003e0; 1148 private static final int GCB_SHIFT = 5; 1149 1150 /** 1151 * Integer properties mask for decomposition type. 1152 * Equivalent to icu4c UPROPS_DT_MASK. 1153 */ 1154 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f; 1155 1156 /** 1157 * First nibble shift 1158 */ 1159 private static final int FIRST_NIBBLE_SHIFT_ = 0x4; 1160 /** 1161 * Second nibble mask 1162 */ 1163 private static final int LAST_NIBBLE_MASK_ = 0xF; 1164 /** 1165 * Age value shift 1166 */ 1167 private static final int AGE_SHIFT_ = 24; 1168 1169 1170 // private constructors -------------------------------------------------- 1171 1172 /** 1173 * Constructor 1174 * @exception IOException thrown when data reading fails or data corrupted 1175 */ 1176 private UCharacterProperty() throws IOException 1177 { 1178 // consistency check 1179 if(binProps.length!=UProperty.BINARY_LIMIT) { 1180 throw new ICUException("binProps.length!=UProperty.BINARY_LIMIT"); 1181 } 1182 if(intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)) { 1183 throw new ICUException("intProps.length!=(UProperty.INT_LIMIT-UProperty.INT_START)"); 1184 } 1185 1186 // jar access 1187 ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_); 1188 m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable()); 1189 // Read or skip the 16 indexes. 1190 int propertyOffset = bytes.getInt(); 1191 /* exceptionOffset = */ bytes.getInt(); 1192 /* caseOffset = */ bytes.getInt(); 1193 int additionalOffset = bytes.getInt(); 1194 int additionalVectorsOffset = bytes.getInt(); 1195 m_additionalColumnsCount_ = bytes.getInt(); 1196 int scriptExtensionsOffset = bytes.getInt(); 1197 int reservedOffset7 = bytes.getInt(); 1198 /* reservedOffset8 = */ bytes.getInt(); 1199 /* dataTopOffset = */ bytes.getInt(); 1200 m_maxBlockScriptValue_ = bytes.getInt(); 1201 m_maxJTGValue_ = bytes.getInt(); 1202 ICUBinary.skipBytes(bytes, (16 - 12) << 2); 1203 1204 // read the main properties trie 1205 m_trie_ = Trie2_16.createFromSerialized(bytes); 1206 int expectedTrieLength = (propertyOffset - 16) * 4; 1207 int trieLength = m_trie_.getSerializedLength(); 1208 if(trieLength > expectedTrieLength) { 1209 throw new IOException("uprops.icu: not enough bytes for main trie"); 1210 } 1211 // skip padding after trie bytes 1212 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1213 1214 // skip unused intervening data structures 1215 ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); 1216 1217 if(m_additionalColumnsCount_ > 0) { 1218 // reads the additional property block 1219 m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); 1220 expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; 1221 trieLength = m_additionalTrie_.getSerializedLength(); 1222 if(trieLength > expectedTrieLength) { 1223 throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); 1224 } 1225 // skip padding after trie bytes 1226 ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); 1227 1228 // additional properties 1229 int size = scriptExtensionsOffset - additionalVectorsOffset; 1230 m_additionalVectors_ = new int[size]; 1231 for (int i = 0; i < size; i ++) { 1232 m_additionalVectors_[i] = bytes.getInt(); 1233 } 1234 } 1235 1236 // Script_Extensions 1237 int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; 1238 if(numChars > 0) { 1239 m_scriptExtensions_ = new char[numChars]; 1240 for(int i = 0; i < numChars; ++i) { 1241 m_scriptExtensions_[i] = bytes.getChar(); 1242 } 1243 } 1244 } 1245 1246 private static final class IsAcceptable implements ICUBinary.Authenticate { 1247 // @Override when we switch to Java 6 1248 public boolean isDataVersionAcceptable(byte version[]) { 1249 return version[0] == 7; 1250 } 1251 } 1252 private static final int DATA_FORMAT = 0x5550726F; // "UPro" 1253 1254 // private methods ------------------------------------------------------- 1255 1256 /* 1257 * Compare additional properties to see if it has argument type 1258 * @param property 32 bit properties 1259 * @param type character type 1260 * @return true if property has type 1261 */ 1262 /*private boolean compareAdditionalType(int property, int type) 1263 { 1264 return (property & (1 << type)) != 0; 1265 }*/ 1266 1267 // property starts for UnicodeSet -------------------------------------- *** 1268 1269 private static final int TAB = 0x0009; 1270 //private static final int LF = 0x000a; 1271 //private static final int FF = 0x000c; 1272 private static final int CR = 0x000d; 1273 private static final int U_A = 0x0041; 1274 private static final int U_F = 0x0046; 1275 private static final int U_Z = 0x005a; 1276 private static final int U_a = 0x0061; 1277 private static final int U_f = 0x0066; 1278 private static final int U_z = 0x007a; 1279 private static final int DEL = 0x007f; 1280 private static final int NL = 0x0085; 1281 private static final int NBSP = 0x00a0; 1282 private static final int CGJ = 0x034f; 1283 private static final int FIGURESP= 0x2007; 1284 private static final int HAIRSP = 0x200a; 1285 //private static final int ZWNJ = 0x200c; 1286 //private static final int ZWJ = 0x200d; 1287 private static final int RLM = 0x200f; 1288 private static final int NNBSP = 0x202f; 1289 private static final int WJ = 0x2060; 1290 private static final int INHSWAP = 0x206a; 1291 private static final int NOMDIG = 0x206f; 1292 private static final int U_FW_A = 0xff21; 1293 private static final int U_FW_F = 0xff26; 1294 private static final int U_FW_Z = 0xff3a; 1295 private static final int U_FW_a = 0xff41; 1296 private static final int U_FW_f = 0xff46; 1297 private static final int U_FW_z = 0xff5a; 1298 private static final int ZWNBSP = 0xfeff; 1299 1300 public UnicodeSet addPropertyStarts(UnicodeSet set) { 1301 /* add the start code point of each same-value range of the main trie */ 1302 Iterator<Trie2.Range> trieIterator = m_trie_.iterator(); 1303 Trie2.Range range; 1304 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1305 set.add(range.startCodePoint); 1306 } 1307 1308 /* add code points with hardcoded properties, plus the ones following them */ 1309 1310 /* add for u_isblank() */ 1311 set.add(TAB); 1312 set.add(TAB+1); 1313 1314 /* add for IS_THAT_CONTROL_SPACE() */ 1315 set.add(CR+1); /* range TAB..CR */ 1316 set.add(0x1c); 1317 set.add(0x1f+1); 1318 set.add(NL); 1319 set.add(NL+1); 1320 1321 /* add for u_isIDIgnorable() what was not added above */ 1322 set.add(DEL); /* range DEL..NBSP-1, NBSP added below */ 1323 set.add(HAIRSP); 1324 set.add(RLM+1); 1325 set.add(INHSWAP); 1326 set.add(NOMDIG+1); 1327 set.add(ZWNBSP); 1328 set.add(ZWNBSP+1); 1329 1330 /* add no-break spaces for u_isWhitespace() what was not added above */ 1331 set.add(NBSP); 1332 set.add(NBSP+1); 1333 set.add(FIGURESP); 1334 set.add(FIGURESP+1); 1335 set.add(NNBSP); 1336 set.add(NNBSP+1); 1337 1338 /* add for u_charDigitValue() */ 1339 // TODO remove when UCharacter.getHanNumericValue() is changed to just return 1340 // Unicode numeric values 1341 set.add(0x3007); 1342 set.add(0x3008); 1343 set.add(0x4e00); 1344 set.add(0x4e01); 1345 set.add(0x4e8c); 1346 set.add(0x4e8d); 1347 set.add(0x4e09); 1348 set.add(0x4e0a); 1349 set.add(0x56db); 1350 set.add(0x56dc); 1351 set.add(0x4e94); 1352 set.add(0x4e95); 1353 set.add(0x516d); 1354 set.add(0x516e); 1355 set.add(0x4e03); 1356 set.add(0x4e04); 1357 set.add(0x516b); 1358 set.add(0x516c); 1359 set.add(0x4e5d); 1360 set.add(0x4e5e); 1361 1362 /* add for u_digit() */ 1363 set.add(U_a); 1364 set.add(U_z+1); 1365 set.add(U_A); 1366 set.add(U_Z+1); 1367 set.add(U_FW_a); 1368 set.add(U_FW_z+1); 1369 set.add(U_FW_A); 1370 set.add(U_FW_Z+1); 1371 1372 /* add for u_isxdigit() */ 1373 set.add(U_f+1); 1374 set.add(U_F+1); 1375 set.add(U_FW_f+1); 1376 set.add(U_FW_F+1); 1377 1378 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ 1379 set.add(WJ); /* range WJ..NOMDIG */ 1380 set.add(0xfff0); 1381 set.add(0xfffb+1); 1382 set.add(0xe0000); 1383 set.add(0xe0fff+1); 1384 1385 /* add for UCHAR_GRAPHEME_BASE and others */ 1386 set.add(CGJ); 1387 set.add(CGJ+1); 1388 1389 return set; // for chaining 1390 } 1391 1392 public void upropsvec_addPropertyStarts(UnicodeSet set) { 1393 /* add the start code point of each same-value range of the properties vectors trie */ 1394 if(m_additionalColumnsCount_>0) { 1395 /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ 1396 Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator(); 1397 Trie2.Range range; 1398 while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { 1399 set.add(range.startCodePoint); 1400 } 1401 } 1402 } 1403 1404 // This static initializer block must be placed after 1405 // other static member initialization 1406 static { 1407 try { 1408 INSTANCE = new UCharacterProperty(); 1409 } 1410 catch (IOException e) { 1411 throw new MissingResourceException(e.getMessage(),"",""); 1412 } 1413 } 1414 1415 /*---------------------------------------------------------------- 1416 * Inclusions list 1417 *----------------------------------------------------------------*/ 1418 1419 /* 1420 * Return a set of characters for property enumeration. 1421 * The set implicitly contains 0x110000 as well, which is one more than the highest 1422 * Unicode code point. 1423 * 1424 * This set is used as an ordered list - its code points are ordered, and 1425 * consecutive code points (in Unicode code point order) in the set define a range. 1426 * For each two consecutive characters (start, limit) in the set, 1427 * all of the UCD/normalization and related properties for 1428 * all code points start..limit-1 are all the same, 1429 * except for character names and ISO comments. 1430 * 1431 * All Unicode code points U+0000..U+10ffff are covered by these ranges. 1432 * The ranges define a partition of the Unicode code space. 1433 * ICU uses the inclusions set to enumerate properties for generating 1434 * UnicodeSets containing all code points that have a certain property value. 1435 * 1436 * The Inclusion List is generated from the UCD. It is generated 1437 * by enumerating the data tries, and code points for hardcoded properties 1438 * are added as well. 1439 * 1440 * -------------------------------------------------------------------------- 1441 * 1442 * The following are ideas for getting properties-unique code point ranges, 1443 * with possible optimizations beyond the current implementation. 1444 * These optimizations would require more code and be more fragile. 1445 * The current implementation generates one single list (set) for all properties. 1446 * 1447 * To enumerate properties efficiently, one needs to know ranges of 1448 * repetitive values, so that the value of only each start code point 1449 * can be applied to the whole range. 1450 * This information is in principle available in the uprops.icu/unorm.icu data. 1451 * 1452 * There are two obstacles: 1453 * 1454 * 1. Some properties are computed from multiple data structures, 1455 * making it necessary to get repetitive ranges by intersecting 1456 * ranges from multiple tries. 1457 * 1458 * 2. It is not economical to write code for getting repetitive ranges 1459 * that are precise for each of some 50 properties. 1460 * 1461 * Compromise ideas: 1462 * 1463 * - Get ranges per trie, not per individual property. 1464 * Each range contains the same values for a whole group of properties. 1465 * This would generate currently five range sets, two for uprops.icu tries 1466 * and three for unorm.icu tries. 1467 * 1468 * - Combine sets of ranges for multiple tries to get sufficient sets 1469 * for properties, e.g., the uprops.icu main and auxiliary tries 1470 * for all non-normalization properties. 1471 * 1472 * Ideas for representing ranges and combining them: 1473 * 1474 * - A UnicodeSet could hold just the start code points of ranges. 1475 * Multiple sets are easily combined by or-ing them together. 1476 * 1477 * - Alternatively, a UnicodeSet could hold each even-numbered range. 1478 * All ranges could be enumerated by using each start code point 1479 * (for the even-numbered ranges) as well as each limit (end+1) code point 1480 * (for the odd-numbered ranges). 1481 * It should be possible to combine two such sets by xor-ing them, 1482 * but no more than two. 1483 * 1484 * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays, 1485 * but the first one is certainly simpler and applicable for combining more than 1486 * two range sets. 1487 * 1488 * It is possible to combine all range sets for all uprops/unorm tries into one 1489 * set that can be used for all properties. 1490 * As an optimization, there could be less-combined range sets for certain 1491 * groups of properties. 1492 * The relationship of which less-combined range set to use for which property 1493 * depends on the implementation of the properties and must be hardcoded 1494 * - somewhat error-prone and higher maintenance but can be tested easily 1495 * by building property sets "the simple way" in test code. 1496 * 1497 * --- 1498 * 1499 * Do not use a UnicodeSet pattern because that causes infinite recursion; 1500 * UnicodeSet depends on the inclusions set. 1501 * 1502 * --- 1503 * 1504 * getInclusions() is commented out starting 2005-feb-12 because 1505 * UnicodeSet now calls the uxyz_addPropertyStarts() directly, 1506 * and only for the relevant property source. 1507 */ 1508 /* 1509 public UnicodeSet getInclusions() { 1510 UnicodeSet set = new UnicodeSet(); 1511 NormalizerImpl.addPropertyStarts(set); 1512 addPropertyStarts(set); 1513 return set; 1514 } 1515 */ 1516 } 1517