Home | History | Annotate | Download | only in coll
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /**
      4 *******************************************************************************
      5 * Copyright (C) 1996-2014, International Business Machines Corporation and
      6 * others. All Rights Reserved.
      7 *******************************************************************************
      8 */
      9 package com.ibm.icu.impl.coll;
     10 
     11 import com.ibm.icu.util.ByteArrayWrapper;
     12 
     13 /**
     14  * <p>Binary Ordered Compression Scheme for Unicode</p>
     15  *
     16  * <p>Users are strongly encouraged to read the ICU paper on
     17  * <a href="http://www.icu-project.org/docs/papers/binary_ordered_compression_for_unicode.html">
     18  * BOCU</a> before attempting to use this class.</p>
     19  *
     20  * <p>BOCU is used to compress unicode text into a stream of unsigned
     21  * bytes.  For many kinds of text the compression compares favorably
     22  * to UTF-8, and for some kinds of text (such as CJK) it does better.
     23  * The resulting bytes will compare in the same order as the original
     24  * code points.  The byte stream does not contain the values 0, 1, or
     25  * 2.</p>
     26  *
     27  * <p>One example of a use of BOCU is in
     28  * com.ibm.icu.text.Collator#getCollationKey(String) for a RuleBasedCollator object with
     29  * collation strength IDENTICAL. The result CollationKey will consist of the
     30  * collation order of the source string followed by the BOCU result of the
     31  * source string.
     32  * </p>
     33  *
     34  * <p>Unlike a UTF encoding, BOCU-compressed text is not suitable for
     35  * random access.</p>
     36  *
     37  * <p>Method: Slope Detection<br> Remember the previous code point
     38  * (initial 0).  For each code point in the string, encode the
     39  * difference with the previous one.  Similar to a UTF, the length of
     40  * the byte sequence is encoded in the lead bytes.  Unlike a UTF, the
     41  * trail byte values may overlap with lead/single byte values.  The
     42  * signedness of the difference must be encoded as the most
     43  * significant part.</p>
     44  *
     45  * <p>We encode differences with few bytes if their absolute values
     46  * are small.  For correct ordering, we must treat the entire value
     47  * range -10ffff..+10ffff in ascending order, which forbids encoding
     48  * the sign and the absolute value separately. Instead, we split the
     49  * lead byte range in the middle and encode non-negative values going
     50  * up and negative values going down.</p>
     51  *
     52  * <p>For very small absolute values, the difference is added to a
     53  * middle byte value for single-byte encoded differences.  For
     54  * somewhat larger absolute values, the difference is divided by the
     55  * number of byte values available, the modulo is used for one trail
     56  * byte, and the remainder is added to a lead byte avoiding the
     57  * single-byte range.  For large absolute values, the difference is
     58  * similarly encoded in three bytes. (Syn Wee, I need examples
     59  * here.)</p>
     60  *
     61  * <p>BOCU does not use byte values 0, 1, or 2, but uses all other
     62  * byte values for lead and single bytes, so that the middle range of
     63  * single bytes is as large as possible.</p>
     64  *
     65  * <p>Note that the lead byte ranges overlap some, but that the
     66  * sequences as a whole are well ordered. I.e., even if the lead byte
     67  * is the same for sequences of different lengths, the trail bytes
     68  * establish correct order.  It would be possible to encode slightly
     69  * larger ranges for each length (>1) by subtracting the lower bound
     70  * of the range. However, that would also slow down the calculation.
     71  * (Syn Wee, need an example).</p>
     72  *
     73  * <p>For the actual string encoding, an optimization moves the
     74  * previous code point value to the middle of its Unicode script block
     75  * to minimize the differences in same-script text runs.  (Syn Wee,
     76  * need an example.)</p>
     77  *
     78  * @author Syn Wee Quek
     79  * @since release 2.2, May 3rd 2002
     80  */
     81 public class BOCSU
     82 {
     83     // public methods -------------------------------------------------------
     84 
     85     /**
     86      * Encode the code points of a string as
     87      * a sequence of byte-encoded differences (slope detection),
     88      * preserving lexical order.
     89      *
     90      * <p>Optimize the difference-taking for runs of Unicode text within
     91      * small scripts:
     92      *
     93      * <p>Most small scripts are allocated within aligned 128-blocks of Unicode
     94      * code points. Lexical order is preserved if "prev" is always moved
     95      * into the middle of such a block.
     96      *
     97      * <p>Additionally, "prev" is moved from anywhere in the Unihan
     98      * area into the middle of that area.
     99      * Note that the identical-level run in a sort key is generated from
    100      * NFD text - there are never Hangul characters included.
    101      */
    102     public static int writeIdenticalLevelRun(int prev, CharSequence s, int i, int length, ByteArrayWrapper sink) {
    103         while (i < length) {
    104             // We must have capacity>=SLOPE_MAX_BYTES in case writeDiff() writes that much,
    105             // but we do not want to force the sink to allocate
    106             // for a large min_capacity because we might actually only write one byte.
    107             ensureAppendCapacity(sink, 16, s.length() * 2);
    108             byte[] buffer = sink.bytes;
    109             int capacity = buffer.length;
    110             int p = sink.size;
    111             int lastSafe = capacity - SLOPE_MAX_BYTES_;
    112             while (i < length && p <= lastSafe) {
    113                 if (prev < 0x4e00 || prev >= 0xa000) {
    114                     prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
    115                 } else {
    116                     // Unihan U+4e00..U+9fa5:
    117                     // double-bytes down from the upper end
    118                     prev = 0x9fff - SLOPE_REACH_POS_2_;
    119                 }
    120 
    121                 int c = Character.codePointAt(s, i);
    122                 i += Character.charCount(c);
    123                 if (c == 0xfffe) {
    124                     buffer[p++] = 2;  // merge separator
    125                     prev = 0;
    126                 } else {
    127                     p = writeDiff(c - prev, buffer, p);
    128                     prev = c;
    129                 }
    130             }
    131             sink.size = p;
    132         }
    133         return prev;
    134     }
    135 
    136     private static void ensureAppendCapacity(ByteArrayWrapper sink, int minCapacity, int desiredCapacity) {
    137         int remainingCapacity = sink.bytes.length - sink.size;
    138         if (remainingCapacity >= minCapacity) { return; }
    139         if (desiredCapacity < minCapacity) { desiredCapacity = minCapacity; }
    140         sink.ensureCapacity(sink.size + desiredCapacity);
    141     }
    142 
    143     // private data members --------------------------------------------------
    144 
    145     /**
    146      * Do not use byte values 0, 1, 2 because they are separators in sort keys.
    147      */
    148     private static final int SLOPE_MIN_ = 3;
    149     private static final int SLOPE_MAX_ = 0xff;
    150     private static final int SLOPE_MIDDLE_ = 0x81;
    151     private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1;
    152     private static final int SLOPE_MAX_BYTES_ = 4;
    153 
    154     /**
    155      * Number of lead bytes:
    156      * 1        middle byte for 0
    157      * 2*80=160 single bytes for !=0
    158      * 2*42=84  for double-byte values
    159      * 2*3=6    for 3-byte values
    160      * 2*1=2    for 4-byte values
    161      *
    162      * The sum must be <=SLOPE_TAIL_COUNT.
    163      *
    164      * Why these numbers?
    165      * - There should be >=128 single-byte values to cover 128-blocks
    166      *   with small scripts.
    167      * - There should be >=20902 single/double-byte values to cover Unihan.
    168      * - It helps CJK Extension B some if there are 3-byte values that cover
    169      *   the distance between them and Unihan.
    170      *   This also helps to jump among distant places in the BMP.
    171      * - Four-byte values are necessary to cover the rest of Unicode.
    172      *
    173      * Symmetrical lead byte counts are for convenience.
    174      * With an equal distribution of even and odd differences there is also
    175      * no advantage to asymmetrical lead byte counts.
    176      */
    177     private static final int SLOPE_SINGLE_ = 80;
    178     private static final int SLOPE_LEAD_2_ = 42;
    179     private static final int SLOPE_LEAD_3_ = 3;
    180     //private static final int SLOPE_LEAD_4_ = 1;
    181 
    182     /**
    183      * The difference value range for single-byters.
    184      */
    185     private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_;
    186     private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_);
    187 
    188     /**
    189      * The difference value range for double-byters.
    190      */
    191     private static final int SLOPE_REACH_POS_2_ =
    192         SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1;
    193     private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1);
    194 
    195     /**
    196      * The difference value range for 3-byters.
    197      */
    198     private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_
    199         * SLOPE_TAIL_COUNT_
    200         * SLOPE_TAIL_COUNT_
    201         + (SLOPE_LEAD_3_ - 1)
    202         * SLOPE_TAIL_COUNT_ +
    203         (SLOPE_TAIL_COUNT_ - 1);
    204     private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1);
    205 
    206     /**
    207      * The lead byte start values.
    208      */
    209     private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_
    210         + SLOPE_SINGLE_ + 1;
    211     private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_
    212         + SLOPE_LEAD_2_;
    213     private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_ +
    214         SLOPE_REACH_NEG_1_;
    215     private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_
    216         - SLOPE_LEAD_2_;
    217 
    218     // private constructor ---------------------------------------------------
    219 
    220     /**
    221      * Constructor private to prevent initialization
    222      */
    223     ///CLOVER:OFF
    224     private BOCSU()
    225     {
    226     }
    227     ///CLOVER:ON
    228 
    229     // private methods -------------------------------------------------------
    230 
    231     /**
    232      * Integer division and modulo with negative numerators
    233      * yields negative modulo results and quotients that are one more than
    234      * what we need here.
    235      * @param number which operations are to be performed on
    236      * @param factor the factor to use for division
    237      * @return (result of division) << 32 | modulo
    238      */
    239     private static final long getNegDivMod(int number, int factor)
    240     {
    241         int modulo = number % factor;
    242         long result = number / factor;
    243         if (modulo < 0) {
    244             -- result;
    245             modulo += factor;
    246         }
    247         return (result << 32) | modulo;
    248     }
    249 
    250     /**
    251      * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
    252      * preserving lexical order
    253      * @param diff
    254      * @param buffer byte buffer to append to
    255      * @param offset to the byte buffer to start appending
    256      * @return end offset where the appending stops
    257      */
    258     private static final int writeDiff(int diff, byte buffer[], int offset)
    259     {
    260         if (diff >= SLOPE_REACH_NEG_1_) {
    261             if (diff <= SLOPE_REACH_POS_1_) {
    262                 buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff);
    263             }
    264             else if (diff <= SLOPE_REACH_POS_2_) {
    265                 buffer[offset ++] = (byte)(SLOPE_START_POS_2_
    266                                            + (diff / SLOPE_TAIL_COUNT_));
    267                 buffer[offset ++] = (byte)(SLOPE_MIN_ +
    268                                            (diff % SLOPE_TAIL_COUNT_));
    269             }
    270             else if (diff <= SLOPE_REACH_POS_3_) {
    271                 buffer[offset + 2] = (byte)(SLOPE_MIN_
    272                                             + (diff % SLOPE_TAIL_COUNT_));
    273                 diff /= SLOPE_TAIL_COUNT_;
    274                 buffer[offset + 1] = (byte)(SLOPE_MIN_
    275                                             + (diff % SLOPE_TAIL_COUNT_));
    276                 buffer[offset] = (byte)(SLOPE_START_POS_3_
    277                                         + (diff / SLOPE_TAIL_COUNT_));
    278                 offset += 3;
    279             }
    280             else {
    281                 buffer[offset + 3] = (byte)(SLOPE_MIN_
    282                                             + diff % SLOPE_TAIL_COUNT_);
    283                 diff /= SLOPE_TAIL_COUNT_;
    284                 buffer[offset + 2] = (byte)(SLOPE_MIN_
    285                                         + diff % SLOPE_TAIL_COUNT_);
    286                 diff /= SLOPE_TAIL_COUNT_;
    287                 buffer[offset + 1] = (byte)(SLOPE_MIN_
    288                                             + diff % SLOPE_TAIL_COUNT_);
    289                 buffer[offset] = (byte)SLOPE_MAX_;
    290                 offset += 4;
    291             }
    292         }
    293         else {
    294             long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
    295             int modulo = (int)division;
    296             if (diff >= SLOPE_REACH_NEG_2_) {
    297                 diff = (int)(division >> 32);
    298                 buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff);
    299                 buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo);
    300             }
    301             else if (diff >= SLOPE_REACH_NEG_3_) {
    302                 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
    303                 diff = (int)(division >> 32);
    304                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
    305                 modulo = (int)division;
    306                 diff = (int)(division >> 32);
    307                 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
    308                 buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff);
    309                 offset += 3;
    310             }
    311             else {
    312                 buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo);
    313                 diff = (int)(division >> 32);
    314                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
    315                 modulo = (int)division;
    316                 diff = (int)(division >> 32);
    317                 buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
    318                 division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
    319                 modulo = (int)division;
    320                 buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
    321                 buffer[offset] = SLOPE_MIN_;
    322                 offset += 4;
    323             }
    324         }
    325         return offset;
    326     }
    327 }
    328