Home | History | Annotate | Download | only in intltest
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2012, International Business Machines                 *
      4  *   Corporation and others.  All Rights Reserved.                            *
      5  ******************************************************************************
      6  */
      7 
      8 /**
      9  * \file
     10  * \brief Originally, added as C++ API for Collation data used to compute minLengthInChars
     11  * \internal
     12  */
     13 
     14 /*
     15  * Note: This module was incldued in ICU 4.0.1 as @internal technology preview for supporting
     16  * Boyer-Moore string search API. For now, only SSearchTest depends on this module. I temporaly
     17  * moved the module from i18n directory to intltest, because we have no plan to publish this
     18  * as public API. (2012-12-18 yoshito)
     19  */
     20 
     21 #ifndef COLL_DATA_H
     22 #define COLL_DATA_H
     23 
     24 #include "unicode/utypes.h"
     25 
     26 #if !UCONFIG_NO_COLLATION
     27 
     28 #include "unicode/ucol.h"
     29 #include "unicode/unistr.h"
     30 
     31  /**
     32   * The size of the internal CE buffer in a <code>CEList</code> object
     33   */
     34 #define CELIST_BUFFER_SIZE 4
     35 
     36 /**
     37  * \def INSTRUMENT_CELIST
     38  * Define this to enable the <code>CEList</code> objects to collect
     39  * statistics.
     40  */
     41 
     42  /**
     43   * The size of the initial list in a <code>StringList</code> object.
     44   */
     45 #define STRING_LIST_BUFFER_SIZE 16
     46 
     47 U_NAMESPACE_USE
     48 
     49  /**
     50   * This object holds a list of CEs generated from a particular
     51   * <code>UnicodeString</code>
     52   *
     53   */
     54 class CEList
     55 {
     56 public:
     57     /**
     58      * Construct a <code>CEList</code> object.
     59      *
     60      * @param coll - the Collator used to collect the CEs.
     61      * @param string - the string for which to collect the CEs.
     62      * @param status - will be set if any errors occur.
     63      *
     64      * Note: if on return, status is set to an error code,
     65      * the only safe thing to do with this object is to call
     66      * the destructor.
     67      */
     68     CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
     69 
     70     /**
     71      * The destructor.
     72      */
     73     ~CEList();
     74 
     75     /**
     76      * Return the number of CEs in the list.
     77      *
     78      * @return the number of CEs in the list.
     79      */
     80     int32_t size() const;
     81 
     82     /**
     83      * Get a particular CE from the list.
     84      *
     85      * @param index - the index of the CE to return
     86      *
     87      * @return the CE, or <code>0</code> if <code>index</code> is out of range
     88      */
     89     uint32_t get(int32_t index) const;
     90 
     91     /**
     92      * Check if the CEs in another <code>CEList</code> match the
     93      * suffix of this list starting at a give offset.
     94      *
     95      * @param offset - the offset of the suffix
     96      * @param other - the other <code>CEList</code>
     97      *
     98      * @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
     99      */
    100     UBool matchesAt(int32_t offset, const CEList *other) const;
    101 
    102     /**
    103      * The index operator.
    104      *
    105      * @param index - the index
    106      *
    107      * @return a reference to the given CE in the list
    108      */
    109     uint32_t &operator[](int32_t index) const;
    110 
    111 private:
    112     void add(uint32_t ce, UErrorCode &status);
    113 
    114     uint32_t ceBuffer[CELIST_BUFFER_SIZE];
    115     uint32_t *ces;
    116     int32_t listMax;
    117     int32_t listSize;
    118 };
    119 
    120 /**
    121  * StringList
    122  *
    123  * This object holds a list of <code>UnicodeString</code> objects.
    124  */
    125 class StringList
    126 {
    127 public:
    128     /**
    129      * Construct an empty <code>StringList</code>
    130      *
    131      * @param status - will be set if any errors occur.
    132      *
    133      * Note: if on return, status is set to an error code,
    134      * the only safe thing to do with this object is to call
    135      * the destructor.
    136      */
    137     StringList(UErrorCode &status);
    138 
    139     /**
    140      * The destructor.
    141      */
    142     ~StringList();
    143 
    144     /**
    145      * Add a string to the list.
    146      *
    147      * @param string - the string to add
    148      * @param status - will be set if any errors occur.
    149      */
    150     void add(const UnicodeString *string, UErrorCode &status);
    151 
    152     /**
    153      * Add an array of Unicode code points to the list.
    154      *
    155      * @param chars - the address of the array of code points
    156      * @param count - the number of code points in the array
    157      * @param status - will be set if any errors occur.
    158      */
    159     void add(const UChar *chars, int32_t count, UErrorCode &status);
    160 
    161     /**
    162      * Get a particular string from the list.
    163      *
    164      * @param index - the index of the string
    165      *
    166      * @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
    167      *         if <code>index</code> is out of bounds.
    168      */
    169     const UnicodeString *get(int32_t index) const;
    170 
    171     /**
    172      * Get the number of stings in the list.
    173      *
    174      * @return the number of strings in the list.
    175      */
    176     int32_t size() const;
    177 
    178 private:
    179     UnicodeString *strings;
    180     int32_t listMax;
    181     int32_t listSize;
    182 };
    183 
    184 
    185 /*
    186  * Forward references to internal classes.
    187  */
    188 class StringToCEsMap;
    189 class CEToStringsMap;
    190 
    191 /**
    192  * CollData
    193  *
    194  * This class holds the Collator-specific data needed to
    195  * compute the length of the shortest string that can
    196  * generate a partcular list of CEs.
    197  *
    198  * <code>CollData</code> objects are quite expensive to compute. Because
    199  * of this, they are cached. When you call <code>CollData::open</code> it
    200  * returns a reference counted cached object. When you call <code>CollData::close</code>
    201  * the reference count on the object is decremented but the object is not deleted.
    202  *
    203  * If you do not need to reuse any unreferenced objects in the cache, you can call
    204  * <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
    205  * objects, you can call <code>CollData::freeCollDataCache</code>
    206  */
    207 class CollData
    208 {
    209 public:
    210     /**
    211      * Construct a <code>CollData</code> object.
    212      *
    213      * @param collator - the collator
    214      * @param status - will be set if any errors occur.
    215      */
    216     CollData(UCollator *collator, UErrorCode &status);
    217 
    218     /**
    219      * The destructor.
    220      */
    221     ~CollData();
    222 
    223     /**
    224      * Get the <code>UCollator</code> object used to create this object.
    225      * The object returned may not be the exact object that was used to
    226      * create this object, but it will have the same behavior.
    227      */
    228     UCollator *getCollator() const;
    229 
    230     /**
    231      * Get a list of all the strings which generate a list
    232      * of CEs starting with a given CE.
    233      *
    234      * @param ce - the CE
    235      *
    236      * return a <code>StringList</code> object containing all
    237      *        the stirngs, or <code>NULL</code> if there are
    238      *        no such strings.
    239      */
    240     const StringList *getStringList(int32_t ce) const;
    241 
    242     /**
    243      * Get a list of the CEs generated by a partcular stirng.
    244      *
    245      * @param string - the string
    246      *
    247      * @return a <code>CEList</code> object containt the CEs. You
    248      *         must call <code>freeCEList</code> when you are finished
    249      *         using the <code>CEList</code>/
    250      */
    251     const CEList *getCEList(const UnicodeString *string) const;
    252 
    253     /**
    254      * Release a <code>CEList</code> returned by <code>getCEList</code>.
    255      *
    256      * @param list - the <code>CEList</code> to free.
    257      */
    258     void freeCEList(const CEList *list);
    259 
    260     /**
    261      * Return the length of the shortest string that will generate
    262      * the given list of CEs.
    263      *
    264      * @param ces - the CEs
    265      * @param offset - the offset of the first CE in the list to use.
    266      *
    267      * @return the length of the shortest string.
    268      */
    269     int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
    270 
    271 
    272     /**
    273      * Return the length of the shortest string that will generate
    274      * the given list of CEs.
    275      *
    276      * Note: the algorithm used to do this computation is recursive. To
    277      * limit the amount of recursion, a "history" list is used to record
    278      * the best answer starting at a particular offset in the list of CEs.
    279      * If the same offset is visited again during the recursion, the answer
    280      * in the history list is used.
    281      *
    282      * @param ces - the CEs
    283      * @param offset - the offset of the first CE in the list to use.
    284      * @param history - the history list. Must be at least as long as
    285      *                 the number of cEs in the <code>CEList</code>
    286      *
    287      * @return the length of the shortest string.
    288      */
    289    int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
    290 
    291 private:
    292     UCollator      *coll;
    293     CEToStringsMap *ceToCharsStartingWith;
    294 
    295     uint32_t minHan;
    296     uint32_t maxHan;
    297 
    298     uint32_t jamoLimits[4];
    299 };
    300 
    301 #endif // #if !UCONFIG_NO_COLLATION
    302 #endif // #ifndef COLL_DATA_H
    303