1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2007, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * file name: unisetspan.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2007mar01 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __UNISETSPAN_H__ 20 #define __UNISETSPAN_H__ 21 22 #include "unicode/utypes.h" 23 #include "unicode/uniset.h" 24 25 U_NAMESPACE_BEGIN 26 27 /* 28 * Implement span() etc. for a set with strings. 29 * Avoid recursion because of its exponential complexity. 30 * Instead, try multiple paths at once and track them with an IndexList. 31 */ 32 class UnicodeSetStringSpan : public UMemory { 33 public: 34 /* 35 * Which span() variant will be used? 36 * The object is either built for one variant and used once, 37 * or built for all and may be used many times. 38 */ 39 enum { 40 FWD = 0x20, 41 BACK = 0x10, 42 UTF16 = 8, 43 UTF8 = 4, 44 CONTAINED = 2, 45 NOT_CONTAINED = 1, 46 47 ALL = 0x3f, 48 49 FWD_UTF16_CONTAINED = FWD | UTF16 | CONTAINED, 50 FWD_UTF16_NOT_CONTAINED = FWD | UTF16 | NOT_CONTAINED, 51 FWD_UTF8_CONTAINED = FWD | UTF8 | CONTAINED, 52 FWD_UTF8_NOT_CONTAINED = FWD | UTF8 | NOT_CONTAINED, 53 BACK_UTF16_CONTAINED = BACK | UTF16 | CONTAINED, 54 BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED, 55 BACK_UTF8_CONTAINED = BACK | UTF8 | CONTAINED, 56 BACK_UTF8_NOT_CONTAINED = BACK | UTF8 | NOT_CONTAINED 57 }; 58 59 UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which); 60 61 // Copy constructor. Assumes which==ALL for a frozen set. 62 UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings); 63 64 ~UnicodeSetStringSpan(); 65 66 /* 67 * Do the strings need to be checked in span() etc.? 68 * @return TRUE if strings need to be checked (call span() here), 69 * FALSE if not (use a BMPSet for best performance). 70 */ 71 inline UBool needsStringSpanUTF16(); 72 inline UBool needsStringSpanUTF8(); 73 74 // For fast UnicodeSet::contains(c). 75 inline UBool contains(UChar32 c) const; 76 77 int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const; 78 79 int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const; 80 81 int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const; 82 83 int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const; 84 85 private: 86 // Special spanLength byte values. 87 enum { 88 // The spanLength is >=0xfe. 89 LONG_SPAN=0xfe, 90 // All code points in the string are contained in the parent set. 91 ALL_CP_CONTAINED=0xff 92 }; 93 94 // Add a starting or ending string character to the spanNotSet 95 // so that a character span ends before any string. 96 void addToSpanNotSet(UChar32 c); 97 98 int32_t spanNot(const UChar *s, int32_t length) const; 99 int32_t spanNotBack(const UChar *s, int32_t length) const; 100 int32_t spanNotUTF8(const uint8_t *s, int32_t length) const; 101 int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const; 102 103 // Set for span(). Same as parent but without strings. 104 UnicodeSet spanSet; 105 106 // Set for span(not contained). 107 // Same as spanSet, plus characters that start or end strings. 108 UnicodeSet *pSpanNotSet; 109 110 // The strings of the parent set. 111 const UVector &strings; 112 113 // Pointer to the UTF-8 string lengths. 114 // Also pointer to further allocated storage for meta data and 115 // UTF-8 string contents as necessary. 116 int32_t *utf8Lengths; 117 118 // Pointer to the part of the (utf8Lengths) memory block that stores 119 // the lengths of span(), spanBack() etc. for each string. 120 uint8_t *spanLengths; 121 122 // Pointer to the part of the (utf8Lengths) memory block that stores 123 // the UTF-8 versions of the parent set's strings. 124 uint8_t *utf8; 125 126 // Number of bytes for all UTF-8 versions of strings together. 127 int32_t utf8Length; 128 129 // Maximum lengths of relevant strings. 130 int32_t maxLength16; 131 int32_t maxLength8; 132 133 // Set up for all variants of span()? 134 UBool all; 135 136 // Memory for small numbers and lengths of strings. 137 // For example, for 8 strings: 138 // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters 139 // = 112 bytes = int32_t[28]. 140 int32_t staticLengths[32]; 141 }; 142 143 UBool UnicodeSetStringSpan::needsStringSpanUTF16() { 144 return (UBool)(maxLength16!=0); 145 } 146 147 UBool UnicodeSetStringSpan::needsStringSpanUTF8() { 148 return (UBool)(maxLength8!=0); 149 } 150 151 UBool UnicodeSetStringSpan::contains(UChar32 c) const { 152 return spanSet.contains(c); 153 } 154 155 U_NAMESPACE_END 156 157 #endif 158