Home | History | Annotate | Download | only in draft
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2007, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   file name:  trieset.cpp
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2007jan15
     12 *   created by: Markus Scherer
     13 *
     14 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
     15 *   using a UTrie with 8-bit (byte) results per code point.
     16 *   Modifies the trie index to make the BMP linear, and uses the original set
     17 *   for supplementary code points.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicont.h"
     22 
     23 #define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
     24 
     25 #define UTRIE_GET8_FROM_LEAD(trie, c16) \
     26     ((const uint8_t *)(trie)->data32)[ \
     27         ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
     28         ((c16)&UTRIE_MASK) \
     29     ]
     30 
     31 class TrieSet : public UObject, public UnicodeContainable {
     32 public:
     33     TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
     34             : trieData(NULL), latin1(NULL), restSet(set.clone()) {
     35         if(U_FAILURE(errorCode)) {
     36             return;
     37         }
     38         if(restSet==NULL) {
     39             errorCode=U_MEMORY_ALLOCATION_ERROR;
     40             return;
     41         }
     42 
     43         UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE);
     44         UChar32 start, end;
     45 
     46         UnicodeSetIterator iter(set);
     47 
     48         while(iter.nextRange() && !iter.isString()) {
     49             start=iter.getCodepoint();
     50             end=iter.getCodepointEnd();
     51             if(start>0xffff) {
     52                 break;
     53             }
     54             if(end>0xffff) {
     55                 end=0xffff;
     56             }
     57             if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) {
     58                 errorCode=U_INTERNAL_PROGRAM_ERROR;
     59                 return;
     60             }
     61         }
     62 
     63         // Preflight the trie length.
     64         int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode);
     65         if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
     66             return;
     67         }
     68 
     69         trieData=(uint32_t *)uprv_malloc(length);
     70         if(trieData==NULL) {
     71             errorCode=U_MEMORY_ALLOCATION_ERROR;
     72             return;
     73         }
     74 
     75         errorCode=U_ZERO_ERROR;
     76         utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode);
     77         utrie_unserialize(&trie, trieData, length, &errorCode);  // TODO: Implement for 8-bit UTrie!
     78 
     79         if(U_SUCCESS(errorCode)) {
     80             // Copy the indexes for surrogate code points into the BMP range
     81             // for simple access across the entire BMP.
     82             uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
     83                         trie.index+UTRIE_BMP_INDEX_LENGTH,
     84                         (0x800>>UTRIE_SHIFT)*2);
     85             latin1=UTRIE_GET8_LATIN1(&trie);
     86         }
     87 
     88         restSet.remove(0, 0xffff);
     89     }
     90 
     91     ~TrieSet() {
     92         uprv_free(trieData);
     93         delete restSet;
     94     }
     95 
     96     UBool contains(UChar32 c) const {
     97         if((uint32_t)c<=0xff) {
     98             return (UBool)latin1[c];
     99         } else if((uint32_t)c<0xffff) {
    100             return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
    101         } else {
    102             return restSet->contains(c);
    103         }
    104     }
    105 
    106 private:
    107     uint32_t *trieData;
    108     const uint8_t *latin1;
    109     UTrie trie;
    110     UnicodeSet *restSet;
    111 };
    112