Home | History | Annotate | Download | only in draft
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2007, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   file name:  bitset.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2007jan15
     12 *   created by: Markus Scherer
     13 *
     14 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
     15 *   using a folded bit set consisting of a 1k-entry index table and a
     16 *   compacted array of 64-bit words.
     17 *   Uses a simple hash table for compaction.
     18 *   Uses the original set for supplementary code points.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 #include "unicont.h"
     23 
     24 /*
     25  * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
     26  * Hashes 64-bit words and maps them to 16-bit integers which are
     27  * assigned in order of new incoming words for subsequent storage
     28  * in a contiguous array.
     29  */
     30 struct BMPBitHash : public UObject {
     31     int64_t keys[0x800];  // 2k
     32     uint16_t values[0x800];
     33     uint16_t reverse[0x400];
     34     uint16_t count;
     35     const int32_t prime=1301;  // Less than 2k.
     36 
     37     BMPBitHash() : count(0) {
     38         // Fill values[] with 0xffff.
     39         uprv_memset(values, 0xff, sizeof(values));
     40     }
     41 
     42     /*
     43      * Map a key to an integer count.
     44      * Map at most 1k=0x400 different keys with this data structure.
     45      */
     46     uint16_t map(int64_t key) {
     47         int32_t hash=(int32_t)(key>>55)&0x1ff;
     48         hash^=(int32_t)(key>>44)&0x7ff;
     49         hash^=(int32_t)(key>>33)&0x7ff;
     50         hash^=(int32_t)(key>>22)&0x7ff;
     51         hash^=(int32_t)(key>>11)&0x7ff;
     52         hash^=(int32_t)key&0x7ff;
     53         for(;;) {
     54             if(values[hash]==0xffff) {
     55                 // Unused slot.
     56                 keys[hash]=key;
     57                 reverse[count]=hash;
     58                 return values[hash]=count++;
     59             } else if(keys[hash]==key) {
     60                 // Found a slot with this key.
     61                 return values[hash];
     62             } else {
     63                 // Used slot with a different key, move to another slot.
     64                 hash=(hash+prime)&0x7ff;
     65             }
     66         }
     67     }
     68 
     69     uint16_t countKeys() const { return count; }
     70 
     71     /*
     72      * Invert the hash map: Fill an array of length countKeys() with the keys
     73      * indexed by their mapped values.
     74      */
     75     void invert(int64_t *k) const {
     76         uint16_t i;
     77 
     78         for(i=0; i<count; ++i) {
     79             k[i]=keys[reverse[i]];
     80         }
     81     }
     82 };
     83 
     84 class BitSet : public UObject, public UnicodeContainable {
     85 public:
     86     BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
     87         if(U_FAILURE(errorCode)) {
     88             return;
     89         }
     90         BMPBitHash *bitHash=new BMPBitHash;
     91         if(bitHash==NULL || restSet==NULL) {
     92             errorCode=U_MEMORY_ALLOCATION_ERROR;
     93             return;
     94         }
     95 
     96         UnicodeSetIterator iter(set);
     97         int64_t b;
     98         UChar32 start, end;
     99         int32_t prevIndex, i, j;
    100 
    101         b=0;  // Not necessary but makes compilers happy.
    102         prevIndex=-1;
    103         for(;;) {
    104             if(iter.nextRange() && !iter.isString()) {
    105                 start=iter.getCodepoint();
    106                 end=iter.getCodepointEnd();
    107             } else {
    108                 start=0x10000;
    109             }
    110             i=start>>6;
    111             if(prevIndex!=i) {
    112                 // Finish the end of the previous range.
    113                 if(prevIndex<0) {
    114                     prevIndex=0;
    115                 } else {
    116                     index[prevIndex++]=bitHash->map(b);
    117                 }
    118                 // Fill all-zero entries between ranges.
    119                 if(prevIndex<i) {
    120                     uint16_t zero=bitHash->map(0);
    121                     do {
    122                         index[prevIndex++]=zero;
    123                     } while(prevIndex<i);
    124                 }
    125                 b=0;
    126             }
    127             if(start>0xffff) {
    128                 break;
    129             }
    130             b|=~((INT64_C(1)<<(start&0x3f))-1);
    131             j=end>>6;
    132             if(i<j) {
    133                 // Set bits for the start of the range.
    134                 index[i++]=bitHash->map(b);
    135                 // Fill all-one entries inside the range.
    136                 if(i<j) {
    137                     uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
    138                     do {
    139                         index[i++]=all;
    140                     } while(i<j);
    141                 }
    142                 b=INT64_C(0xffffffffffffffff);
    143             }
    144             /* i==j */
    145             b&=(INT64_C(1)<<(end&0x3f))-1;
    146             prevIndex=j;
    147         }
    148 
    149         if(bitHash->countKeys()>LENGTHOF(shortBits)) {
    150             bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
    151         }
    152         if(bits!=NULL) {
    153             bitHash->invert(bits);
    154         } else {
    155             bits=shortBits;
    156             errorCode=U_MEMORY_ALLOCATION_ERROR;
    157             return;
    158         }
    159 
    160         latin1Set[0]=(uint32_t)bits[0];
    161         latin1Set[1]=(uint32_t)(bits[0]>>32);
    162         latin1Set[2]=(uint32_t)bits[1];
    163         latin1Set[3]=(uint32_t)(bits[1]>>32);
    164         latin1Set[4]=(uint32_t)bits[2];
    165         latin1Set[5]=(uint32_t)(bits[2]>>32);
    166         latin1Set[6]=(uint32_t)bits[3];
    167         latin1Set[7]=(uint32_t)(bits[3]>>32);
    168 
    169         restSet.remove(0, 0xffff);
    170     }
    171 
    172     ~BitSet() {
    173         if(bits!=shortBits) {
    174             uprv_free(bits);
    175         }
    176         delete restSet;
    177     }
    178 
    179     UBool contains(UChar32 c) const {
    180         if((uint32_t)c<=0xff) {
    181             return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
    182         } else if((uint32_t)c<0xffff) {
    183             return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
    184         } else {
    185             return restSet->contains(c);
    186         }
    187     }
    188 
    189 private:
    190     uint16_t index[0x400];
    191     int64_t shortBits[32];
    192     int64_t *bits;
    193 
    194     uint32_t latin1Bits[8];
    195 
    196     UnicodeSet *restSet;
    197 };
    198