Home | History | Annotate | Download | only in draft
      1 /*
      2 *************************************************************************
      3 *   Copyright (C) 2016 and later: Unicode, Inc. and others.
      4 *   License & terms of use: http://www.unicode.org/copyright.html#License
      5 *************************************************************************
      6 *************************************************************************
      7 *   Copyright (C) 2007, International Business Machines
      8 *   Corporation and others.  All Rights Reserved.
      9 *************************************************************************
     10 *   file name:  trieset.cpp
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2007jan15
     16 *   created by: Markus Scherer
     17 *
     18 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
     19 *   using a UTrie with 8-bit (byte) results per code point.
     20 *   Modifies the trie index to make the BMP linear, and uses the original set
     21 *   for supplementary code points.
     22 */
     23 
     24 #include "unicode/utypes.h"
     25 #include "unicont.h"
     26 
     27 #define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
     28 
     29 #define UTRIE_GET8_FROM_LEAD(trie, c16) \
     30     ((const uint8_t *)(trie)->data32)[ \
     31         ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
     32         ((c16)&UTRIE_MASK) \
     33     ]
     34 
     35 class TrieSet : public UObject, public UnicodeContainable {
     36 public:
     37     TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
     38             : trieData(NULL), latin1(NULL), restSet(set.clone()) {
     39         if(U_FAILURE(errorCode)) {
     40             return;
     41         }
     42         if(restSet==NULL) {
     43             errorCode=U_MEMORY_ALLOCATION_ERROR;
     44             return;
     45         }
     46 
     47         UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE);
     48         UChar32 start, end;
     49 
     50         UnicodeSetIterator iter(set);
     51 
     52         while(iter.nextRange() && !iter.isString()) {
     53             start=iter.getCodepoint();
     54             end=iter.getCodepointEnd();
     55             if(start>0xffff) {
     56                 break;
     57             }
     58             if(end>0xffff) {
     59                 end=0xffff;
     60             }
     61             if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) {
     62                 errorCode=U_INTERNAL_PROGRAM_ERROR;
     63                 return;
     64             }
     65         }
     66 
     67         // Preflight the trie length.
     68         int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode);
     69         if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
     70             return;
     71         }
     72 
     73         trieData=(uint32_t *)uprv_malloc(length);
     74         if(trieData==NULL) {
     75             errorCode=U_MEMORY_ALLOCATION_ERROR;
     76             return;
     77         }
     78 
     79         errorCode=U_ZERO_ERROR;
     80         utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode);
     81         utrie_unserialize(&trie, trieData, length, &errorCode);  // TODO: Implement for 8-bit UTrie!
     82 
     83         if(U_SUCCESS(errorCode)) {
     84             // Copy the indexes for surrogate code points into the BMP range
     85             // for simple access across the entire BMP.
     86             uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
     87                         trie.index+UTRIE_BMP_INDEX_LENGTH,
     88                         (0x800>>UTRIE_SHIFT)*2);
     89             latin1=UTRIE_GET8_LATIN1(&trie);
     90         }
     91 
     92         restSet.remove(0, 0xffff);
     93     }
     94 
     95     ~TrieSet() {
     96         uprv_free(trieData);
     97         delete restSet;
     98     }
     99 
    100     UBool contains(UChar32 c) const {
    101         if((uint32_t)c<=0xff) {
    102             return (UBool)latin1[c];
    103         } else if((uint32_t)c<0xffff) {
    104             return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
    105         } else {
    106             return restSet->contains(c);
    107         }
    108     }
    109 
    110 private:
    111     uint32_t *trieData;
    112     const uint8_t *latin1;
    113     UTrie trie;
    114     UnicodeSet *restSet;
    115 };
    116