Home | History | Annotate | Download | only in impl
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  ******************************************************************************
      5  * Copyright (C) 1996-2011, International Business Machines Corporation and   *
      6  * others. All Rights Reserved.                                               *
      7  ******************************************************************************
      8  */
      9 
     10 /**
     11  * Store bits (Unicode character properties) in bit set vectors.
     12  *
     13  * This is a port of the C++ class UPropsVectors from ICU4C
     14  *
     15  * @author Shaopeng Jia
     16  * @internal
     17  */
     18 
     19 package com.ibm.icu.impl;
     20 
     21 import java.util.Arrays;
     22 import java.util.Comparator;
     23 
     24 /**
     25  * Unicode Properties Vectors associated with code point ranges.
     26  *
     27  * Rows of primitive integers in a contiguous array store the range limits and
     28  * the properties vectors.
     29  *
     30  * In each row, row[0] contains the start code point and row[1] contains the
     31  * limit code point, which is the start of the next range.
     32  *
     33  * Initially, there is only one range [0..0x110000] with values 0.
     34  *
     35  * It would be possible to store only one range boundary per row, but
     36  * self-contained rows allow to later sort them by contents.
     37  */
     38 public class PropsVectors {
     39     private int v[];
     40     private int columns; // number of columns, plus two for start
     41     // and limit values
     42     private int maxRows;
     43     private int rows;
     44     private int prevRow; // search optimization: remember last row seen
     45     private boolean isCompacted;
     46 
     47     // internal function to compare elements in v and target. Return true iff
     48     // elements in v starting from index1 to index1 + length - 1
     49     // are exactly the same as elements in target
     50     // starting from index2 to index2 + length - 1
     51     private boolean areElementsSame(int index1, int[] target, int index2,
     52             int length) {
     53         for (int i = 0; i < length; ++i) {
     54             if (v[index1 + i] != target[index2 + i]) {
     55                 return false;
     56             }
     57         }
     58         return true;
     59     }
     60 
     61     // internal function which given rangeStart, returns
     62     // index where v[index]<=rangeStart<v[index+1].
     63     // The returned index is a multiple of columns, and therefore
     64     // points to the start of a row.
     65     private int findRow(int rangeStart) {
     66         int index = 0;
     67 
     68         // check the vicinity of the last-seen row (start
     69         // searching with an unrolled loop)
     70 
     71         index = prevRow * columns;
     72         if (rangeStart >= v[index]) {
     73             if (rangeStart < v[index + 1]) {
     74                 // same row as last seen
     75                 return index;
     76             } else {
     77                 index += columns;
     78                 if (rangeStart < v[index + 1]) {
     79                     ++prevRow;
     80                     return index;
     81                 } else {
     82                     index += columns;
     83                     if (rangeStart < v[index + 1]) {
     84                         prevRow += 2;
     85                         return index;
     86                     } else if ((rangeStart - v[index + 1]) < 10) {
     87                         // we are close, continue looping
     88                         prevRow += 2;
     89                         do {
     90                             ++prevRow;
     91                             index += columns;
     92                         } while (rangeStart >= v[index + 1]);
     93                         return index;
     94                     }
     95                 }
     96             }
     97         } else if (rangeStart < v[1]) {
     98             // the very first row
     99             prevRow = 0;
    100             return 0;
    101         }
    102 
    103         // do a binary search for the start of the range
    104         int start = 0;
    105         int mid = 0;
    106         int limit = rows;
    107         while (start < limit - 1) {
    108             mid = (start + limit) / 2;
    109             index = columns * mid;
    110             if (rangeStart < v[index]) {
    111                 limit = mid;
    112             } else if (rangeStart < v[index + 1]) {
    113                 prevRow = mid;
    114                 return index;
    115             } else {
    116                 start = mid;
    117             }
    118         }
    119 
    120         // must be found because all ranges together always cover
    121         // all of Unicode
    122         prevRow = start;
    123         index = start * columns;
    124         return index;
    125     }
    126 
    127     /*
    128      * Special pseudo code points for storing the initialValue and the
    129      * errorValue which are used to initialize a Trie or similar.
    130      */
    131     public final static int FIRST_SPECIAL_CP = 0x110000;
    132     public final static int INITIAL_VALUE_CP = 0x110000;
    133     public final static int ERROR_VALUE_CP = 0x110001;
    134     public final static int MAX_CP = 0x110001;
    135 
    136     public final static int INITIAL_ROWS = 1 << 12;
    137     public final static int MEDIUM_ROWS = 1 << 16;
    138     public final static int MAX_ROWS = MAX_CP + 1;
    139 
    140     /*
    141      * Constructor.
    142      * @param numOfColumns Number of value integers (32-bit int) per row.
    143      */
    144     public PropsVectors(int numOfColumns) {
    145         if (numOfColumns < 1) {
    146             throw new IllegalArgumentException("numOfColumns need to be no "
    147                     + "less than 1; but it is " + numOfColumns);
    148         }
    149         columns = numOfColumns + 2; // count range start and limit columns
    150         v = new int[INITIAL_ROWS * columns];
    151         maxRows = INITIAL_ROWS;
    152         rows = 2 + (MAX_CP - FIRST_SPECIAL_CP);
    153         prevRow = 0;
    154         isCompacted = false;
    155         v[0] = 0;
    156         v[1] = 0x110000;
    157         int index = columns;
    158         for (int cp = FIRST_SPECIAL_CP; cp <= MAX_CP; ++cp) {
    159             v[index] = cp;
    160             v[index + 1] = cp + 1;
    161             index += columns;
    162         }
    163     }
    164 
    165     /*
    166      * In rows for code points [start..end], select the column, reset the mask
    167      * bits and set the value bits (ANDed with the mask).
    168      *
    169      * @throws IllegalArgumentException
    170      *
    171      * @throws IllegalStateException
    172      *
    173      * @throws IndexOutOfBoundsException
    174      */
    175     public void setValue(int start, int end, int column, int value, int mask) {
    176         if (start < 0 || start > end || end > MAX_CP || column < 0
    177                 || column >= (columns - 2)) {
    178             throw new IllegalArgumentException();
    179         }
    180         if (isCompacted) {
    181             throw new IllegalStateException("Shouldn't be called after"
    182                     + "compact()!");
    183         }
    184 
    185         int firstRow, lastRow;
    186         int limit = end + 1;
    187         boolean splitFirstRow, splitLastRow;
    188         // skip range start and limit columns
    189         column += 2;
    190         value &= mask;
    191 
    192         // find the rows whose ranges overlap with the input range
    193         // find the first and last row, always successful
    194         firstRow = findRow(start);
    195         lastRow = findRow(end);
    196 
    197         /*
    198          * Rows need to be split if they partially overlap with the input range
    199          * (only possible for the first and last rows) and if their value
    200          * differs from the input value.
    201          */
    202         splitFirstRow = (start != v[firstRow] && value != (v[firstRow + column] & mask));
    203         splitLastRow = (limit != v[lastRow + 1] && value != (v[lastRow + column] & mask));
    204 
    205         // split first/last rows if necessary
    206         if (splitFirstRow || splitLastRow) {
    207             int rowsToExpand = 0;
    208             if (splitFirstRow) {
    209                 ++rowsToExpand;
    210             }
    211             if (splitLastRow) {
    212                 ++rowsToExpand;
    213             }
    214             int newMaxRows = 0;
    215             if ((rows + rowsToExpand) > maxRows) {
    216                 if (maxRows < MEDIUM_ROWS) {
    217                     newMaxRows = MEDIUM_ROWS;
    218                 } else if (maxRows < MAX_ROWS) {
    219                     newMaxRows = MAX_ROWS;
    220                 } else {
    221                     throw new IndexOutOfBoundsException(
    222                             "MAX_ROWS exceeded! Increase it to a higher value" +
    223                             "in the implementation");
    224                 }
    225                 int[] temp = new int[newMaxRows * columns];
    226                 System.arraycopy(v, 0, temp, 0, rows * columns);
    227                 v = temp;
    228                 maxRows = newMaxRows;
    229             }
    230 
    231             // count the number of row cells to move after the last row,
    232             // and move them
    233             int count = (rows * columns) - (lastRow + columns);
    234             if (count > 0) {
    235                 System.arraycopy(v, lastRow + columns, v, lastRow
    236                         + (1 + rowsToExpand) * columns, count);
    237             }
    238             rows += rowsToExpand;
    239 
    240             // split the first row, and move the firstRow pointer
    241             // to the second part
    242             if (splitFirstRow) {
    243                 // copy all affected rows up one and move the lastRow pointer
    244                 count = lastRow - firstRow + columns;
    245                 System.arraycopy(v, firstRow, v, firstRow + columns, count);
    246                 lastRow += columns;
    247 
    248                 // split the range and move the firstRow pointer
    249                 v[firstRow + 1] = v[firstRow + columns] = start;
    250                 firstRow += columns;
    251             }
    252 
    253             // split the last row
    254             if (splitLastRow) {
    255                 // copy the last row data
    256                 System.arraycopy(v, lastRow, v, lastRow + columns, columns);
    257 
    258                 // split the range and move the firstRow pointer
    259                 v[lastRow + 1] = v[lastRow + columns] = limit;
    260             }
    261         }
    262 
    263         // set the "row last seen" to the last row for the range
    264         prevRow = lastRow / columns;
    265 
    266         // set the input value in all remaining rows
    267         firstRow += column;
    268         lastRow += column;
    269         mask = ~mask;
    270         for (;;) {
    271             v[firstRow] = (v[firstRow] & mask) | value;
    272             if (firstRow == lastRow) {
    273                 break;
    274             }
    275             firstRow += columns;
    276         }
    277     }
    278 
    279     /*
    280      * Always returns 0 if called after compact().
    281      */
    282     public int getValue(int c, int column) {
    283         if (isCompacted || c < 0 || c > MAX_CP || column < 0
    284                 || column >= (columns - 2)) {
    285             return 0;
    286         }
    287         int index = findRow(c);
    288         return v[index + 2 + column];
    289     }
    290 
    291     /*
    292      * Returns an array which contains value elements
    293      * in row rowIndex.
    294      *
    295      * @throws IllegalStateException
    296      * @throws IllegalArgumentException
    297      */
    298     public int[] getRow(int rowIndex) {
    299         if (isCompacted) {
    300             throw new IllegalStateException(
    301                     "Illegal Invocation of the method after compact()");
    302         }
    303         if (rowIndex < 0 || rowIndex > rows) {
    304             throw new IllegalArgumentException("rowIndex out of bound!");
    305         }
    306         int[] rowToReturn = new int[columns - 2];
    307         System.arraycopy(v, rowIndex * columns + 2, rowToReturn, 0,
    308                          columns - 2);
    309         return rowToReturn;
    310     }
    311 
    312     /*
    313      * Returns an int which is the start codepoint
    314      * in row rowIndex.
    315      *
    316      * @throws IllegalStateException
    317      *
    318      * @throws IllegalArgumentException
    319      */
    320     public int getRowStart(int rowIndex) {
    321         if (isCompacted) {
    322             throw new IllegalStateException(
    323                     "Illegal Invocation of the method after compact()");
    324         }
    325         if (rowIndex < 0 || rowIndex > rows) {
    326             throw new IllegalArgumentException("rowIndex out of bound!");
    327         }
    328         return v[rowIndex * columns];
    329     }
    330 
    331     /*
    332      * Returns an int which is the limit codepoint
    333      * minus 1 in row rowIndex.
    334      *
    335      * @throws IllegalStateException
    336      *
    337      * @throws IllegalArgumentException
    338      */
    339     public int getRowEnd(int rowIndex) {
    340         if (isCompacted) {
    341             throw new IllegalStateException(
    342                     "Illegal Invocation of the method after compact()");
    343         }
    344         if (rowIndex < 0 || rowIndex > rows) {
    345             throw new IllegalArgumentException("rowIndex out of bound!");
    346         }
    347         return v[rowIndex * columns + 1] - 1;
    348     }
    349 
    350     /*
    351      * Compact the vectors:
    352      * - modify the memory
    353      * - keep only unique vectors
    354      * - store them contiguously from the beginning of the memory
    355      * - for each (non-unique) row, call the respective function in
    356      *   CompactHandler
    357      *
    358      * The handler's rowIndex is the index of the row in the compacted
    359      * memory block. Therefore, it starts at 0 increases in increments of the
    360      * columns value.
    361      *
    362      * In a first phase, only special values are delivered (each exactly once).
    363      * Then CompactHandler::startRealValues() is called
    364      * where rowIndex is the length of the compacted array.
    365      * Then, in the second phase, the CompactHandler::setRowIndexForRange() is
    366      * called for each row of real values.
    367      */
    368     public void compact(CompactHandler compactor) {
    369         if (isCompacted) {
    370             return;
    371         }
    372 
    373         // Set the flag now: Sorting and compacting destroys the builder
    374         // data structure.
    375         isCompacted = true;
    376         int valueColumns = columns - 2; // not counting start & limit
    377 
    378         // sort the properties vectors to find unique vector values
    379         Integer[] indexArray = new Integer[rows];
    380         for (int i = 0; i < rows; ++i) {
    381             indexArray[i] = Integer.valueOf(columns * i);
    382         }
    383 
    384         Arrays.sort(indexArray, new Comparator<Integer>() {
    385             @Override
    386             public int compare(Integer o1, Integer o2) {
    387                 int indexOfRow1 = o1.intValue();
    388                 int indexOfRow2 = o2.intValue();
    389                 int count = columns; // includes start/limit columns
    390 
    391                 // start comparing after start/limit
    392                 // but wrap around to them
    393                 int index = 2;
    394                 do {
    395                     if (v[indexOfRow1 + index] != v[indexOfRow2 + index]) {
    396                         return v[indexOfRow1 + index] < v[indexOfRow2 + index] ? -1
    397                                 : 1;
    398                     }
    399                     if (++index == columns) {
    400                         index = 0;
    401                     }
    402                 } while (--count > 0);
    403 
    404                 return 0;
    405             }
    406         });
    407 
    408         /*
    409          * Find and set the special values. This has to do almost the same work
    410          * as the compaction below, to find the indexes where the special-value
    411          * rows will move.
    412          */
    413         int count = -valueColumns;
    414         for (int i = 0; i < rows; ++i) {
    415             int start = v[indexArray[i].intValue()];
    416 
    417             // count a new values vector if it is different
    418             // from the current one
    419             if (count < 0 || !areElementsSame(indexArray[i].intValue() + 2, v,
    420                     indexArray[i-1].intValue() + 2, valueColumns)) {
    421                 count += valueColumns;
    422             }
    423 
    424             if (start == INITIAL_VALUE_CP) {
    425                 compactor.setRowIndexForInitialValue(count);
    426             } else if (start == ERROR_VALUE_CP) {
    427                 compactor.setRowIndexForErrorValue(count);
    428             }
    429         }
    430 
    431         // count is at the beginning of the last vector,
    432         // add valueColumns to include that last vector
    433         count += valueColumns;
    434 
    435         // Call the handler once more to signal the start of
    436         // delivering real values.
    437         compactor.startRealValues(count);
    438 
    439         /*
    440          * Move vector contents up to a contiguous array with only unique
    441          * vector values, and call the handler function for each vector.
    442          *
    443          * This destroys the Properties Vector structure and replaces it
    444          * with an array of just vector values.
    445          */
    446         int[] temp = new int[count];
    447         count = -valueColumns;
    448         for (int i = 0; i < rows; ++i) {
    449             int start = v[indexArray[i].intValue()];
    450             int limit = v[indexArray[i].intValue() + 1];
    451 
    452             // count a new values vector if it is different
    453             // from the current one
    454             if (count < 0 || !areElementsSame(indexArray[i].intValue() + 2,
    455                     temp, count, valueColumns)) {
    456                 count += valueColumns;
    457                 System.arraycopy(v, indexArray[i].intValue() + 2, temp, count,
    458                         valueColumns);
    459             }
    460 
    461             if (start < FIRST_SPECIAL_CP) {
    462                 compactor.setRowIndexForRange(start, limit - 1, count);
    463             }
    464         }
    465         v = temp;
    466 
    467         // count is at the beginning of the last vector,
    468         // add one to include that last vector
    469         rows = count / valueColumns + 1;
    470     }
    471 
    472     /*
    473      * Get the vectors array after calling compact().
    474      *
    475      * @throws IllegalStateException
    476      */
    477     public int[] getCompactedArray() {
    478         if (!isCompacted) {
    479             throw new IllegalStateException(
    480                     "Illegal Invocation of the method before compact()");
    481         }
    482         return v;
    483     }
    484 
    485     /*
    486      * Get the number of rows for the compacted array.
    487      *
    488      * @throws IllegalStateException
    489      */
    490     public int getCompactedRows() {
    491         if (!isCompacted) {
    492             throw new IllegalStateException(
    493                     "Illegal Invocation of the method before compact()");
    494         }
    495         return rows;
    496     }
    497 
    498     /*
    499      * Get the number of columns for the compacted array.
    500      *
    501      * @throws IllegalStateException
    502      */
    503     public int getCompactedColumns() {
    504         if (!isCompacted) {
    505             throw new IllegalStateException(
    506                     "Illegal Invocation of the method before compact()");
    507         }
    508         return columns - 2;
    509     }
    510 
    511     /*
    512      * Call compact(), create a IntTrie with indexes into the compacted
    513      * vectors array.
    514      */
    515     public IntTrie compactToTrieWithRowIndexes() {
    516         PVecToTrieCompactHandler compactor = new PVecToTrieCompactHandler();
    517         compact(compactor);
    518         return compactor.builder.serialize(new DefaultGetFoldedValue(
    519                 compactor.builder), new DefaultGetFoldingOffset());
    520     }
    521 
    522     // inner class implementation of Trie.DataManipulate
    523     private static class DefaultGetFoldingOffset implements Trie.DataManipulate {
    524         @Override
    525         public int getFoldingOffset(int value) {
    526             return value;
    527         }
    528     }
    529 
    530     // inner class implementation of TrieBuilder.DataManipulate
    531     private static class DefaultGetFoldedValue implements
    532             TrieBuilder.DataManipulate {
    533         private IntTrieBuilder builder;
    534 
    535         public DefaultGetFoldedValue(IntTrieBuilder inBuilder) {
    536             builder = inBuilder;
    537         }
    538 
    539         @Override
    540         public int getFoldedValue(int start, int offset) {
    541             int initialValue = builder.m_initialValue_;
    542             int limit = start + 0x400;
    543             while (start < limit) {
    544                 boolean[] inBlockZero = new boolean[1];
    545                 int value = builder.getValue(start, inBlockZero);
    546                 if (inBlockZero[0]) {
    547                     start += TrieBuilder.DATA_BLOCK_LENGTH;
    548                 } else if (value != initialValue) {
    549                     return offset;
    550                 } else {
    551                     ++start;
    552                 }
    553             }
    554             return 0;
    555         }
    556     }
    557 
    558     public static interface CompactHandler {
    559         public void setRowIndexForRange(int start, int end, int rowIndex);
    560         public void setRowIndexForInitialValue(int rowIndex);
    561         public void setRowIndexForErrorValue(int rowIndex);
    562         public void startRealValues(int rowIndex);
    563     }
    564 }