Home | History | Annotate | Download | only in unicode
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2009, International Business Machines                 *
      4  *   Corporation and others.  All Rights Reserved.                            *
      5  ******************************************************************************
      6  */
      7 
      8 /**
      9  * \file
     10  * \brief C++ API: Collation data used to compute minLengthInChars.
     11  * \internal
     12  */
     13 
     14 #ifndef COLL_DATA_H
     15 #define COLL_DATA_H
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_COLLATION
     20 
     21 #include "unicode/uobject.h"
     22 #include "unicode/ucol.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 /**
     27  * The size of the internal buffer for the Collator's short description string.
     28  */
     29 #define KEY_BUFFER_SIZE 64
     30 
     31  /**
     32   * The size of the internal CE buffer in a <code>CEList</code> object
     33   */
     34 #define CELIST_BUFFER_SIZE 4
     35 
     36 /**
     37  * Define this to enable the <code>CEList</code> objects to collect
     38  * statistics.
     39  */
     40 //#define INSTRUMENT_CELIST
     41 
     42  /**
     43   * The size of the initial list in a <code>StringList</code> object.
     44   */
     45 #define STRING_LIST_BUFFER_SIZE 16
     46 
     47 /**
     48  * Define this to enable the <code>StringList</code> objects to
     49  * collect statistics.
     50  */
     51 //#define INSTRUMENT_STRING_LIST
     52 
     53  /**
     54   * This object holds a list of CEs generated from a particular
     55   * <code>UnicodeString</code>.
     56   *
     57   * @internal ICU 4.0.1 technology preview
     58   */
     59 class U_I18N_API CEList : public UObject
     60 {
     61 public:
     62     /**
     63      * Construct a <code>CEList</code> object.
     64      *
     65      * @param coll - the Collator used to collect the CEs.
     66      * @param string - the string for which to collect the CEs.
     67      * @param status - will be set if any errors occur.
     68      *
     69      * Note: if on return, status is set to an error code,
     70      * the only safe thing to do with this object is to call
     71      * the destructor.
     72      *
     73      * @internal ICU 4.0.1 technology preview
     74      */
     75     CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
     76 
     77     /**
     78      * The destructor.
     79      */
     80     ~CEList();
     81 
     82     /**
     83      * Return the number of CEs in the list.
     84      *
     85      * @return the number of CEs in the list.
     86      *
     87      * @internal ICU 4.0.1 technology preview
     88      */
     89     int32_t size() const;
     90 
     91     /**
     92      * Get a particular CE from the list.
     93      *
     94      * @param index - the index of the CE to return
     95      *
     96      * @return the CE, or <code>0</code> if <code>index</code> is out of range
     97      *
     98      * @internal ICU 4.0.1 technology preview
     99      */
    100     uint32_t get(int32_t index) const;
    101 
    102     /**
    103      * Check if the CEs in another <code>CEList</code> match the
    104      * suffix of this list starting at a given offset.
    105      *
    106      * @param offset - the offset of the suffix
    107      * @param other - the other <code>CEList</code>
    108      *
    109      * @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
    110      *
    111      * @internal ICU 4.0.1 technology preview
    112      */
    113     UBool matchesAt(int32_t offset, const CEList *other) const;
    114 
    115     /**
    116      * The index operator.
    117      *
    118      * @param index - the index
    119      *
    120      * @return a non-const reference to the given CE in the list, even though the operator itself is declared <code>const</code>
    121      *
    122      * @internal ICU 4.0.1 technology preview
    123      */
    124     uint32_t &operator[](int32_t index) const;
    125 
    126     /**
    127      * UObject glue...
    128      */
    129     virtual UClassID getDynamicClassID() const;
    130     /**
    131      * UObject glue...
    132      */
    133     static UClassID getStaticClassID();
    134 
    135 private:
    136     void add(uint32_t ce, UErrorCode &status); // NOTE(review): presumably appends one CE to the list - confirm in the implementation
    137 
    138     uint32_t ceBuffer[CELIST_BUFFER_SIZE]; // internal CE buffer, CELIST_BUFFER_SIZE entries
    139     uint32_t *ces;     // presumably the CE storage in use (ceBuffer or a larger allocation) - TODO confirm
    140     int32_t listMax;   // presumably the capacity of ces - TODO confirm
    141     int32_t listSize;  // presumably the number of CEs currently stored - TODO confirm
    142 
    143 #ifdef INSTRUMENT_CELIST
    144     static int32_t _active;        // statistics; only present when INSTRUMENT_CELIST is defined
    145     static int32_t _histogram[10]; // statistics; only present when INSTRUMENT_CELIST is defined
    146 #endif
    147 };
    148 
    149 /**
    150  * StringList
    151  *
    152  * This object holds a list of <code>UnicodeString</code> objects.
    153  *
    154  * @internal ICU 4.0.1 technology preview
    155  */
    156 class U_I18N_API StringList : public UObject
    157 {
    158 public:
    159     /**
    160      * Construct an empty <code>StringList</code>.
    161      *
    162      * @param status - will be set if any errors occur.
    163      *
    164      * Note: if on return, status is set to an error code,
    165      * the only safe thing to do with this object is to call
    166      * the destructor.
    167      *
    168      * @internal ICU 4.0.1 technology preview
    169      */
    170     StringList(UErrorCode &status);
    171 
    172     /**
    173      * The destructor.
    174      *
    175      * @internal ICU 4.0.1 technology preview
    176      */
    177     ~StringList();
    178 
    179     /**
    180      * Add a string to the list.
    181      *
    182      * @param string - the string to add
    183      * @param status - will be set if any errors occur.
    184      *
    185      * @internal ICU 4.0.1 technology preview
    186      */
    187     void add(const UnicodeString *string, UErrorCode &status);
    188 
    189     /**
    190      * Add an array of Unicode code points to the list.
    191      *
    192      * @param chars - the address of the array of code points
    193      * @param count - the number of code points in the array
    194      * @param status - will be set if any errors occur.
    195      *
    196      * @internal ICU 4.0.1 technology preview
    197      */
    198     void add(const UChar *chars, int32_t count, UErrorCode &status);
    199 
    200     /**
    201      * Get a particular string from the list.
    202      *
    203      * @param index - the index of the string
    204      *
    205      * @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
    206      *         if <code>index</code> is out of bounds.
    207      *
    208      * @internal ICU 4.0.1 technology preview
    209      */
    210     const UnicodeString *get(int32_t index) const;
    211 
    212     /**
    213      * Get the number of strings in the list.
    214      *
    215      * @return the number of strings in the list.
    216      *
    217      * @internal ICU 4.0.1 technology preview
    218      */
    219     int32_t size() const;
    220 
    221     /**
    222      * the UObject glue...
    223      */
    224     virtual UClassID getDynamicClassID() const;
    225     /**
    226      * the UObject glue...
    227      */
    228     static UClassID getStaticClassID();
    229 
    230 private:
    231     UnicodeString *strings; // presumably the array holding the listed strings - TODO confirm
    232     int32_t listMax;        // presumably the capacity of strings (initially STRING_LIST_BUFFER_SIZE) - TODO confirm
    233     int32_t listSize;       // presumably the number of strings currently stored - TODO confirm
    234 
    235 #ifdef INSTRUMENT_STRING_LIST
    236     static int32_t _lists;          // statistics; only present when INSTRUMENT_STRING_LIST is defined
    237     static int32_t _strings;        // statistics; only present when INSTRUMENT_STRING_LIST is defined
    238     static int32_t _histogram[101]; // statistics; only present when INSTRUMENT_STRING_LIST is defined
    239 #endif
    240 };
    241 
    242 /*
    243  * Forward references to internal classes.
    244  */
    245 class StringToCEsMap;
    246 class CEToStringsMap;
    247 class CollDataCache;
    248 
    249 /**
    250  * CollData
    251  *
    252  * This class holds the Collator-specific data needed to
    253  * compute the length of the shortest string that can
    254  * generate a particular list of CEs.
    255  *
    256  * <code>CollData</code> objects are quite expensive to compute. Because
    257  * of this, they are cached. When you call <code>CollData::open</code> it
    258  * returns a reference counted cached object. When you call <code>CollData::close</code>
    259  * the reference count on the object is decremented but the object is not deleted.
    260  *
    261  * If you do not need to reuse any unreferenced objects in the cache, you can call
    262  * <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
    263  * objects, you can call <code>CollData::freeCollDataCache</code>.
    264  *
    265  * @internal ICU 4.0.1 technology preview
    266  */
    267 class U_I18N_API CollData : public UObject
    268 {
    269 public:
    270     /**
    271      * Construct a <code>CollData</code> object.
    272      *
    273      * @param collator - the collator
    274      * @param status - will be set if any errors occur.
    275      *
    276      * @return the <code>CollData</code> object. You must call
    277      *         <code>close</code> when you are done using the object.
    278      *
    279      * Note: if on return, status is set to an error code,
    280      * the only safe thing to do with this object is to call
    281      * <code>CollData::close</code>.
    282      *
    283      * @internal ICU 4.0.1 technology preview
    284      */
    285     static CollData *open(UCollator *collator, UErrorCode &status);
    286 
    287     /**
    288      * Release a <code>CollData</code> object.
    289      *
    290      * @param collData - the object
    291      *
    292      * @internal ICU 4.0.1 technology preview
    293      */
    294     static void close(CollData *collData);
    295 
    296     /**
    297      * Get the <code>UCollator</code> object used to create this object.
    298      * The object returned may not be the exact object that was used to
    299      * create this object, but it will have the same behavior.
    300      */
    301     UCollator *getCollator() const;
    302 
    303     /**
    304      * Get a list of all the strings which generate a list
    305      * of CEs starting with a given CE.
    306      *
    307      * @param ce - the CE (declared <code>int32_t</code> here; <code>CEList</code> hands out CEs as <code>uint32_t</code>)
    308      *
    309      * @return a <code>StringList</code> object containing all
    310      *        the strings, or <code>NULL</code> if there are
    311      *        no such strings.
    312      *
    313      * @internal ICU 4.0.1 technology preview.
    314      */
    315     const StringList *getStringList(int32_t ce) const;
    316 
    317     /**
    318      * Get a list of the CEs generated by a particular string.
    319      *
    320      * @param string - the string
    321      *
    322      * @return a <code>CEList</code> object containing the CEs. You
    323      *         must call <code>freeCEList</code> when you are finished
    324      *         using the <code>CEList</code>.
    325      *
    326      * @internal ICU 4.0.1 technology preview.
    327      */
    328     const CEList *getCEList(const UnicodeString *string) const;
    329 
    330     /**
    331      * Release a <code>CEList</code> returned by <code>getCEList</code>.
    332      *
    333      * @param list - the <code>CEList</code> to free.
    334      *
    335      * @internal ICU 4.0.1 technology preview
    336      */
    337     void freeCEList(const CEList *list);
    338 
    339     /**
    340      * Return the length of the shortest string that will generate
    341      * the given list of CEs.
    342      *
    343      * @param ces - the CEs
    344      * @param offset - the offset of the first CE in the list to use.
    345      *
    346      * @return the length of the shortest string.
    347      *
    348      * @internal ICU 4.0.1 technology preview
    349      */
    350     int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
    351 
    352 
    353     /**
    354      * Return the length of the shortest string that will generate
    355      * the given list of CEs.
    356      *
    357      * Note: the algorithm used to do this computation is recursive. To
    358      * limit the amount of recursion, a "history" list is used to record
    359      * the best answer starting at a particular offset in the list of CEs.
    360      * If the same offset is visited again during the recursion, the answer
    361      * in the history list is used.
    362      *
    363      * @param ces - the CEs
    364      * @param offset - the offset of the first CE in the list to use.
    365      * @param history - the history list. Must be at least as long as
    366      *                 the number of CEs in the <code>CEList</code>.
    367      *
    368      * @return the length of the shortest string.
    369      *
    370      * @internal ICU 4.0.1 technology preview
    371      */
    372    int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
    373 
    374    /**
    375     * UObject glue...
    376     */
    377     virtual UClassID getDynamicClassID() const;
    378    /**
    379     * UObject glue...
    380     */
    381     static UClassID getStaticClassID();
    382 
    383     /**
    384      * <code>CollData</code> objects are expensive to compute, and so
    385      * may be cached. This routine will free the cached objects and delete
    386      * the cache.
    387      *
    388      * WARNING: Don't call this until you have called <code>close</code>
    389      * for each <code>CollData</code> object that you have used. Also,
    390      * DO NOT call this if another thread may be calling <code>flushCollDataCache</code>
    391      * at the same time.
    392      *
    393      * @internal ICU 4.0.1 technology preview
    394      */
    395     static void freeCollDataCache();
    396 
    397     /**
    398      * <code>CollData</code> objects are expensive to compute, and so
    399      * may be cached. This routine will remove any unused <code>CollData</code>
    400      * objects from the cache.
    401      *
    402      * @internal ICU 4.0.1 technology preview
    403      */
    404     static void flushCollDataCache();
    405 
    406 private:
    407     friend class CollDataCache;
    408     friend class CollDataCacheEntry;
    409 
    410     CollData(UCollator *collator, char *cacheKey, int32_t cachekeyLength, UErrorCode &status); // private; callers obtain instances via open()
    411     ~CollData(); // private; callers release instances via close()
    412 
    413     CollData(); // NOTE(review): private default constructor - presumably declared only to block default construction; confirm
    414 
    415     static char *getCollatorKey(UCollator *collator, char *buffer, int32_t bufferLength); // presumably builds the cache key for a collator into buffer - TODO confirm
    416 
    417     static CollDataCache *getCollDataCache(); // presumably returns (and lazily creates) collDataCache - TODO confirm
    418 
    419     UCollator      *coll;                  // presumably the collator handed back by getCollator() - TODO confirm
    420     StringToCEsMap *charsToCEList;         // presumably the lookup behind getCEList() - TODO confirm
    421     CEToStringsMap *ceToCharsStartingWith; // presumably the lookup behind getStringList() - TODO confirm
    422 
    423     char keyBuffer[KEY_BUFFER_SIZE]; // internal buffer for the collator's short description string
    424     char *key;                       // presumably the cache key; may point at keyBuffer - TODO confirm
    425 
    426     static CollDataCache *collDataCache; // the shared (static) cache of CollData objects
    427 
    428     uint32_t minHan; // NOTE(review): presumably the lower bound of the Han CE range - confirm in implementation
    429     uint32_t maxHan; // NOTE(review): presumably the upper bound of the Han CE range - confirm in implementation
    430 
    431     uint32_t jamoLimits[4]; // NOTE(review): presumably CE boundaries for Hangul Jamo - confirm in implementation
    432 };
    433 
    434 U_NAMESPACE_END
    435 
    436 #endif // #if !UCONFIG_NO_COLLATION
    437 #endif // #ifndef COLL_DATA_H
    438