1 /* 2 ********************************************************************** 3 * Copyright (C) 2007, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: bitset.cpp 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2007jan15 12 * created by: Markus Scherer 13 * 14 * Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet 15 * using a folded bit set consisting of a 1k-entry index table and a 16 * compacted array of 64-bit words. 17 * Uses a simple hash table for compaction. 18 * Uses the original set for supplementary code points. 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicont.h" 23 24 /* 25 * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point. 26 * Hashes 64-bit words and maps them to 16-bit integers which are 27 * assigned in order of new incoming words for subsequent storage 28 * in a contiguous array. 29 */ 30 struct BMPBitHash : public UObject { 31 int64_t keys[0x800]; // 2k 32 uint16_t values[0x800]; 33 uint16_t reverse[0x400]; 34 uint16_t count; 35 const int32_t prime=1301; // Less than 2k. 36 37 BMPBitHash() : count(0) { 38 // Fill values[] with 0xffff. 39 uprv_memset(values, 0xff, sizeof(values)); 40 } 41 42 /* 43 * Map a key to an integer count. 44 * Map at most 1k=0x400 different keys with this data structure. 45 */ 46 uint16_t map(int64_t key) { 47 int32_t hash=(int32_t)(key>>55)&0x1ff; 48 hash^=(int32_t)(key>>44)&0x7ff; 49 hash^=(int32_t)(key>>33)&0x7ff; 50 hash^=(int32_t)(key>>22)&0x7ff; 51 hash^=(int32_t)(key>>11)&0x7ff; 52 hash^=(int32_t)key&0x7ff; 53 for(;;) { 54 if(values[hash]==0xffff) { 55 // Unused slot. 56 keys[hash]=key; 57 reverse[count]=hash; 58 return values[hash]=count++; 59 } else if(keys[hash]==key) { 60 // Found a slot with this key. 61 return values[hash]; 62 } else { 63 // Used slot with a different key, move to another slot. 64 hash=(hash+prime)&0x7ff; 65 } 66 } 67 } 68 69 uint16_t countKeys() const { return count; } 70 71 /* 72 * Invert the hash map: Fill an array of length countKeys() with the keys 73 * indexed by their mapped values. 74 */ 75 void invert(int64_t *k) const { 76 uint16_t i; 77 78 for(i=0; i<count; ++i) { 79 k[i]=keys[reverse[i]]; 80 } 81 } 82 }; 83 84 class BitSet : public UObject, public UnicodeContainable { 85 public: 86 BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) { 87 if(U_FAILURE(errorCode)) { 88 return; 89 } 90 BMPBitHash *bitHash=new BMPBitHash; 91 if(bitHash==NULL || restSet==NULL) { 92 errorCode=U_MEMORY_ALLOCATION_ERROR; 93 return; 94 } 95 96 UnicodeSetIterator iter(set); 97 int64_t b; 98 UChar32 start, end; 99 int32_t prevIndex, i, j; 100 101 b=0; // Not necessary but makes compilers happy. 102 prevIndex=-1; 103 for(;;) { 104 if(iter.nextRange() && !iter.isString()) { 105 start=iter.getCodepoint(); 106 end=iter.getCodepointEnd(); 107 } else { 108 start=0x10000; 109 } 110 i=start>>6; 111 if(prevIndex!=i) { 112 // Finish the end of the previous range. 113 if(prevIndex<0) { 114 prevIndex=0; 115 } else { 116 index[prevIndex++]=bitHash->map(b); 117 } 118 // Fill all-zero entries between ranges. 119 if(prevIndex<i) { 120 uint16_t zero=bitHash->map(0); 121 do { 122 index[prevIndex++]=zero; 123 } while(prevIndex<i); 124 } 125 b=0; 126 } 127 if(start>0xffff) { 128 break; 129 } 130 b|=~((INT64_C(1)<<(start&0x3f))-1); 131 j=end>>6; 132 if(i<j) { 133 // Set bits for the start of the range. 134 index[i++]=bitHash->map(b); 135 // Fill all-one entries inside the range. 136 if(i<j) { 137 uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff)); 138 do { 139 index[i++]=all; 140 } while(i<j); 141 } 142 b=INT64_C(0xffffffffffffffff); 143 } 144 /* i==j */ 145 b&=(INT64_C(1)<<(end&0x3f))-1; 146 prevIndex=j; 147 } 148 149 if(bitHash->countKeys()>LENGTHOF(shortBits)) { 150 bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8); 151 } 152 if(bits!=NULL) { 153 bitHash->invert(bits); 154 } else { 155 bits=shortBits; 156 errorCode=U_MEMORY_ALLOCATION_ERROR; 157 return; 158 } 159 160 latin1Set[0]=(uint32_t)bits[0]; 161 latin1Set[1]=(uint32_t)(bits[0]>>32); 162 latin1Set[2]=(uint32_t)bits[1]; 163 latin1Set[3]=(uint32_t)(bits[1]>>32); 164 latin1Set[4]=(uint32_t)bits[2]; 165 latin1Set[5]=(uint32_t)(bits[2]>>32); 166 latin1Set[6]=(uint32_t)bits[3]; 167 latin1Set[7]=(uint32_t)(bits[3]>>32); 168 169 restSet.remove(0, 0xffff); 170 } 171 172 ~BitSet() { 173 if(bits!=shortBits) { 174 uprv_free(bits); 175 } 176 delete restSet; 177 } 178 179 UBool contains(UChar32 c) const { 180 if((uint32_t)c<=0xff) { 181 return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0); 182 } else if((uint32_t)c<0xffff) { 183 return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0); 184 } else { 185 return restSet->contains(c); 186 } 187 } 188 189 private: 190 uint16_t index[0x400]; 191 int64_t shortBits[32]; 192 int64_t *bits; 193 194 uint32_t latin1Bits[8]; 195 196 UnicodeSet *restSet; 197 }; 198