Home | History | Annotate | Download | only in draft
      1 /*
      2 *************************************************************************
      3 *    2016 and later: Unicode, Inc. and others.
      4 *   License & terms of use: http://www.unicode.org/copyright.html#License
      5 *************************************************************************
      6 *************************************************************************
      7 *   Copyright (C) 2014, International Business Machines
      8 *   Corporation and others.  All Rights Reserved.
      9 *************************************************************************
     10 *   file name:  bitset.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2007jan15
     16 *   created by: Markus Scherer
     17 *
     18 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
     19 *   using a folded bit set consisting of a 1k-entry index table and a
     20 *   compacted array of 64-bit words.
     21 *   Uses a simple hash table for compaction.
     22 *   Uses the original set for supplementary code points.
     23 */
     24 
     25 #include "unicode/utypes.h"
     26 #include "unicont.h"
     27 #include "cmemory.h" // for UPRV_LENGTHOF
     28 
     29 /*
     30  * Hash table for up to 1k 64-bit words, for 1 bit per BMP code point.
     31  * Hashes 64-bit words and maps them to 16-bit integers which are
     32  * assigned in order of new incoming words for subsequent storage
     33  * in a contiguous array.
     34  */
     35 struct BMPBitHash : public UObject {
     36     int64_t keys[0x800];  // 2k
     37     uint16_t values[0x800];
     38     uint16_t reverse[0x400];
     39     uint16_t count;
     40     const int32_t prime=1301;  // Less than 2k.
     41 
     42     BMPBitHash() : count(0) {
     43         // Fill values[] with 0xffff.
     44         uprv_memset(values, 0xff, sizeof(values));
     45     }
     46 
     47     /*
     48      * Map a key to an integer count.
     49      * Map at most 1k=0x400 different keys with this data structure.
     50      */
     51     uint16_t map(int64_t key) {
     52         int32_t hash=(int32_t)(key>>55)&0x1ff;
     53         hash^=(int32_t)(key>>44)&0x7ff;
     54         hash^=(int32_t)(key>>33)&0x7ff;
     55         hash^=(int32_t)(key>>22)&0x7ff;
     56         hash^=(int32_t)(key>>11)&0x7ff;
     57         hash^=(int32_t)key&0x7ff;
     58         for(;;) {
     59             if(values[hash]==0xffff) {
     60                 // Unused slot.
     61                 keys[hash]=key;
     62                 reverse[count]=hash;
     63                 return values[hash]=count++;
     64             } else if(keys[hash]==key) {
     65                 // Found a slot with this key.
     66                 return values[hash];
     67             } else {
     68                 // Used slot with a different key, move to another slot.
     69                 hash=(hash+prime)&0x7ff;
     70             }
     71         }
     72     }
     73 
     74     uint16_t countKeys() const { return count; }
     75 
     76     /*
     77      * Invert the hash map: Fill an array of length countKeys() with the keys
     78      * indexed by their mapped values.
     79      */
     80     void invert(int64_t *k) const {
     81         uint16_t i;
     82 
     83         for(i=0; i<count; ++i) {
     84             k[i]=keys[reverse[i]];
     85         }
     86     }
     87 };
     88 
     89 class BitSet : public UObject, public UnicodeContainable {
     90 public:
     91     BitSet(const UnicodeSet &set, UErrorCode &errorCode) : bits(shortBits), restSet(set.clone()) {
     92         if(U_FAILURE(errorCode)) {
     93             return;
     94         }
     95         BMPBitHash *bitHash=new BMPBitHash;
     96         if(bitHash==NULL || restSet==NULL) {
     97             errorCode=U_MEMORY_ALLOCATION_ERROR;
     98             return;
     99         }
    100 
    101         UnicodeSetIterator iter(set);
    102         int64_t b;
    103         UChar32 start, end;
    104         int32_t prevIndex, i, j;
    105 
    106         b=0;  // Not necessary but makes compilers happy.
    107         prevIndex=-1;
    108         for(;;) {
    109             if(iter.nextRange() && !iter.isString()) {
    110                 start=iter.getCodepoint();
    111                 end=iter.getCodepointEnd();
    112             } else {
    113                 start=0x10000;
    114             }
    115             i=start>>6;
    116             if(prevIndex!=i) {
    117                 // Finish the end of the previous range.
    118                 if(prevIndex<0) {
    119                     prevIndex=0;
    120                 } else {
    121                     index[prevIndex++]=bitHash->map(b);
    122                 }
    123                 // Fill all-zero entries between ranges.
    124                 if(prevIndex<i) {
    125                     uint16_t zero=bitHash->map(0);
    126                     do {
    127                         index[prevIndex++]=zero;
    128                     } while(prevIndex<i);
    129                 }
    130                 b=0;
    131             }
    132             if(start>0xffff) {
    133                 break;
    134             }
    135             b|=~((INT64_C(1)<<(start&0x3f))-1);
    136             j=end>>6;
    137             if(i<j) {
    138                 // Set bits for the start of the range.
    139                 index[i++]=bitHash->map(b);
    140                 // Fill all-one entries inside the range.
    141                 if(i<j) {
    142                     uint16_t all=bitHash->map(INT64_C(0xffffffffffffffff));
    143                     do {
    144                         index[i++]=all;
    145                     } while(i<j);
    146                 }
    147                 b=INT64_C(0xffffffffffffffff);
    148             }
    149             /* i==j */
    150             b&=(INT64_C(1)<<(end&0x3f))-1;
    151             prevIndex=j;
    152         }
    153 
    154         if(bitHash->countKeys()>UPRV_LENGTHOF(shortBits)) {
    155             bits=(int64_t *)uprv_malloc(bitHash->countKeys()*8);
    156         }
    157         if(bits!=NULL) {
    158             bitHash->invert(bits);
    159         } else {
    160             bits=shortBits;
    161             errorCode=U_MEMORY_ALLOCATION_ERROR;
    162             return;
    163         }
    164 
    165         latin1Set[0]=(uint32_t)bits[0];
    166         latin1Set[1]=(uint32_t)(bits[0]>>32);
    167         latin1Set[2]=(uint32_t)bits[1];
    168         latin1Set[3]=(uint32_t)(bits[1]>>32);
    169         latin1Set[4]=(uint32_t)bits[2];
    170         latin1Set[5]=(uint32_t)(bits[2]>>32);
    171         latin1Set[6]=(uint32_t)bits[3];
    172         latin1Set[7]=(uint32_t)(bits[3]>>32);
    173 
    174         restSet.remove(0, 0xffff);
    175     }
    176 
    177     ~BitSet() {
    178         if(bits!=shortBits) {
    179             uprv_free(bits);
    180         }
    181         delete restSet;
    182     }
    183 
    184     UBool contains(UChar32 c) const {
    185         if((uint32_t)c<=0xff) {
    186             return (UBool)((latin1Set[c>>5]&((uint32_t)1<<(c&0x1f)))!=0);
    187         } else if((uint32_t)c<0xffff) {
    188             return (UBool)((bits[c>>6]&(INT64_C(1)<<(c&0x3f)))!=0);
    189         } else {
    190             return restSet->contains(c);
    191         }
    192     }
    193 
    194 private:
    195     uint16_t index[0x400];
    196     int64_t shortBits[32];
    197     int64_t *bits;
    198 
    199     uint32_t latin1Bits[8];
    200 
    201     UnicodeSet *restSet;
    202 };
    203