Home | History | Annotate | Download | only in draft
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2014, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   file name:  bitset.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2007jan15
     12 *   created by: Markus Scherer
     13 *
     14 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
     15 *   using a folded bit set consisting of a 1k-entry index table and a
     16 *   compacted array of 64-bit words.
     17 *   Uses a simple hash table for compaction.
     18 *   Uses the original set for supplementary code points.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicont.h"
     23 #include "cmemory.h" // for UPRV_LENGTHOF
     24 
     25 /*
     26  * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
     27  * Hashes 64-bit words and maps them to 16-bit integers which are
     28  * assigned in order of new incoming words for subsequent storage
     29  * in a contiguous array.
     30  */
     31 struct BMPBitHash : public UObject {
     32     int64_t keys[0x800];  // 2k
     33     uint16_t values[0x800];
     34     uint16_t reverse[0x400];
     35     uint16_t count;
     36     const int32_t prime=1301;  // Less than 2k.
     37 
     38     BMPBitHash() : count(0) {
     39         // Fill values[] with 0xffff.
     40         uprv_memset(values, 0xff, sizeof(values));
     41     }
     42 
     43     /*
     44      * Map a key to an integer count.
     45      * Map at most 1k=0x400 different keys with this data structure.
     46      */
     47     uint16_t map(int64_t key) {
     48         int32_t hash=(int32_t)(key>>55)&0x1ff;
     49         hash^=(int32_t)(key>>44)&0x7ff;
     50         hash^=(int32_t)(key>>33)&0x7ff;
     51         hash^=(int32_t)(key>>22)&0x7ff;
     52         hash^=(int32_t)(key>>11)&0x7ff;
     53         hash^=(int32_t)key&0x7ff;
     54         for(;;) {
     55             if(values[hash]==0xffff) {
     56                 // Unused slot.
     57                 keys[hash]=key;
     58                 reverse[count]=hash;
     59                 return values[hash]=count++;
     60             } else if(keys[hash]==key) {
     61                 // Found a slot with this key.
     62                 return values[hash];
     63             } else {
     64                 // Used slot with a different key, move to another slot.
     65                 hash=(hash+prime)&0x7ff;
     66             }
     67         }
     68     }
     69 
     70     uint16_t countKeys() const { return count; }
     71 
     72     /*
     73      * Invert the hash map: Fill an array of length countKeys() with the keys
     74      * indexed by their mapped values.
     75      */
     76     void invert(int64_t *k) const {
     77         uint16_t i;
     78 
     79         for(i=0; i<count; ++i) {
     80             k[i]=keys[reverse[i]];
     81         }
     82     }
     83 };
     84 
     85 class BitSet : public UObject, public UnicodeContainable {
     86 public:
     87     BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
     88         if(U_FAILURE(errorCode)) {
     89             return;
     90         }
     91         BMPBitHash *bitHash=new BMPBitHash;
     92         if(bitHash==NULL || restSet==NULL) {
     93             errorCode=U_MEMORY_ALLOCATION_ERROR;
     94             return;
     95         }
     96 
     97         UnicodeSetIterator iter(set);
     98         int64_t b;
     99         UChar32 start, end;
    100         int32_t prevIndex, i, j;
    101 
    102         b=0;  // Not necessary but makes compilers happy.
    103         prevIndex=-1;
    104         for(;;) {
    105             if(iter.nextRange() && !iter.isString()) {
    106                 start=iter.getCodepoint();
    107                 end=iter.getCodepointEnd();
    108             } else {
    109                 start=0x10000;
    110             }
    111             i=start>>6;
    112             if(prevIndex!=i) {
    113                 // Finish the end of the previous range.
    114                 if(prevIndex<0) {
    115                     prevIndex=0;
    116                 } else {
    117                     index[prevIndex++]=bitHash->map(b);
    118                 }
    119                 // Fill all-zero entries between ranges.
    120                 if(prevIndex<i) {
    121                     uint16_t zero=bitHash->map(0);
    122                     do {
    123                         index[prevIndex++]=zero;
    124                     } while(prevIndex<i);
    125                 }
    126                 b=0;
    127             }
    128             if(start>0xffff) {
    129                 break;
    130             }
    131             b|=~((INT64_C(1)<<(start&0x3f))-1);
    132             j=end>>6;
    133             if(i<j) {
    134                 // Set bits for the start of the range.
    135                 index[i++]=bitHash->map(b);
    136                 // Fill all-one entries inside the range.
    137                 if(i<j) {
    138                     uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
    139                     do {
    140                         index[i++]=all;
    141                     } while(i<j);
    142                 }
    143                 b=INT64_C(0xffffffffffffffff);
    144             }
    145             /* i==j */
    146             b&=(INT64_C(1)<<(end&0x3f))-1;
    147             prevIndex=j;
    148         }
    149 
    150         if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
    151             bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
    152         }
    153         if(bits!=NULL) {
    154             bitHash->invert(bits);
    155         } else {
    156             bits=shortBits;
    157             errorCode=U_MEMORY_ALLOCATION_ERROR;
    158             return;
    159         }
    160 
    161         latin1Set[0]=(uint32_t)bits[0];
    162         latin1Set[1]=(uint32_t)(bits[0]>>32);
    163         latin1Set[2]=(uint32_t)bits[1];
    164         latin1Set[3]=(uint32_t)(bits[1]>>32);
    165         latin1Set[4]=(uint32_t)bits[2];
    166         latin1Set[5]=(uint32_t)(bits[2]>>32);
    167         latin1Set[6]=(uint32_t)bits[3];
    168         latin1Set[7]=(uint32_t)(bits[3]>>32);
    169 
    170         restSet.remove(0, 0xffff);
    171     }
    172 
    173     ~BitSet() {
    174         if(bits!=shortBits) {
    175             uprv_free(bits);
    176         }
    177         delete restSet;
    178     }
    179 
    180     UBool contains(UChar32 c) const {
    181         if((uint32_t)c<=0xff) {
    182             return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
    183         } else if((uint32_t)c<0xffff) {
    184             return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
    185         } else {
    186             return restSet->contains(c);
    187         }
    188     }
    189 
    190 private:
    191     uint16_t index[0x400];
    192     int64_t shortBits[32];
    193     int64_t *bits;
    194 
    195     uint32_t latin1Bits[8];
    196 
    197     UnicodeSet *restSet;
    198 };
    199