impl/coll/ContractionsAndExpansions.java

//  2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* ContractionsAndExpansions.java, ported from collationsets.h/.cpp
*
* C++ version created on: 2013feb09
* created by: Markus W. Scherer
*/

package com.ibm.icu.impl.coll;

import java.util.Iterator;

import com.ibm.icu.impl.Trie2;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.CharsTrie;
import com.ibm.icu.util.CharsTrie.Entry;

/**
 * Enumerates the mappings of a {@link CollationData} object (and, for a tailoring,
 * the untailored mappings of its base data) and collects
 * <ul>
 * <li>the contraction strings into one {@link UnicodeSet},</li>
 * <li>the code points and strings that map to expansions into another, and</li>
 * <li>optionally every CE / CE sequence into a {@link CESink}.</li>
 * </ul>
 * Any of the two sets and the sink may be {@code null}, in which case that kind of
 * output is skipped.
 *
 * <p>Instances hold mutable enumeration state (current data, prefix, suffix, CE buffer).
 */
public final class ContractionsAndExpansions {
    // C++: The following fields are @internal, only public for access by callback.
    /** The data whose mappings are currently enumerated (the tailoring or its base). */
    private CollationData data;
    /** Receives contraction strings; may be null. */
    private final UnicodeSet contractions;
    /** Receives code points and strings that map to expansions; may be null. */
    private final UnicodeSet expansions;
    /** Receives the CEs of the enumerated mappings; may be null. */
    private final CESink sink;
    /** Whether prefix (pre-context) mappings are enumerated as well. */
    private final boolean addPrefixes;
    private int checkTailored = 0;  // -1: collect tailored  +1: exclude tailored
    /** Code points that have a non-fallback mapping in the tailoring data. */
    private final UnicodeSet tailored = new UnicodeSet();
    /** Scratch set for splitting a base range around tailored code points; lazily created. */
    private UnicodeSet ranges;
    /** The current prefix in logical (unreversed) order; empty when there is none. */
    private final StringBuilder unreversedPrefix = new StringBuilder();
    /** The current contraction suffix, or null when there is none. */
    private String suffix;
    /** Scratch buffer for expansions that are assembled from CE32s. */
    private final long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];

    /**
     * Callback that receives the collation elements found during the enumeration.
     */
    public static interface CESink {
        /**
         * Called for a mapping that yields a single CE.
         *
         * @param ce the collation element
         */
        void handleCE(long ce);

        /**
         * Called for a mapping that yields a sequence of CEs.
         *
         * @param ces    array that contains the CEs
         * @param start  index of the first CE in {@code ces}
         * @param length number of CEs starting at {@code start}
         */
        void handleExpansion(long[] ces, int start, int length);
    }

    /**
     * @param con      set that receives the contraction strings, or null
     * @param exp      set that receives the expansion code points and strings, or null
     * @param s        sink that receives the CEs, or null
     * @param prefixes true if prefix (pre-context) mappings are to be enumerated too
     */
    public ContractionsAndExpansions(UnicodeSet con, UnicodeSet exp, CESink s, boolean prefixes) {
        contractions = con;
        expansions = exp;
        sink = s;
        addPrefixes = prefixes;
    }

    /**
     * Enumerates all mappings of {@code d}. If {@code d} has base data, then the base
     * mappings are enumerated afterwards, but only for code points that {@code d}
     * does not tailor.
     *
     * @param d the collation data, either a tailoring or base data
     */
    public void forData(CollationData d) {
        // Add all from the data, can be tailoring or base.
        if (d.base != null) {
            checkTailored = -1;
        }
        data = d;
        Iterator<Trie2.Range> trieIterator = data.trie.iterator();
        Trie2.Range range;
        // Stop at the first lead-surrogate range: only code point ranges are enumerated.
        while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
            enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
        }
        if (d.base == null) {
            return;
        }
        // Add all from the base data but only for un-tailored code points.
        tailored.freeze();
        checkTailored = 1;
        data = d.base;
        trieIterator = data.trie.iterator();
        while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
            enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
        }
    }

    /**
     * Handles one trie range with a uniform CE32 value.
     * Depending on {@code cne.checkTailored}, the range is handled as is (0),
     * recorded in the tailored set unless it is a fallback (&lt;0),
     * or reduced to its untailored parts before it is handled (&gt;0).
     *
     * @param start first code point of the range
     * @param end   last code point of the range (inclusive)
     * @param ce32  the CE32 value shared by all code points in the range
     * @param cne   the enumeration state (C++ callback context)
     */
    private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) {
        if (cne.checkTailored == 0) {
            // There is no tailoring.
            // No need to collect nor check the tailored set.
        } else if (cne.checkTailored < 0) {
            // Collect the set of code points with mappings in the tailoring data.
            if (ce32 == Collation.FALLBACK_CE32) {
                return; // fallback to base, not tailored
            } else {
                cne.tailored.add(start, end);
            }
            // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
        } else if (start == end) {
            if (cne.tailored.contains(start)) {
                return;
            }
        } else if (cne.tailored.containsSome(start, end)) {
            if (cne.ranges == null) {
                cne.ranges = new UnicodeSet();
            }
            cne.ranges.set(start, end).removeAll(cne.tailored);
            int count = cne.ranges.getRangeCount();
            for (int i = 0; i < count; ++i) {
                cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32);
            }
            // Only the untailored sub-ranges are handled. Do not fall through:
            // handling [start, end] as a whole would re-add the tailored code points
            // with their base mappings and report the untailored ones twice.
            return;
        }
        cne.handleCE32(start, end, ce32);
    }

    /**
     * Handles the mapping of the single code point {@code c}.
     * If {@code d} maps {@code c} to the fallback CE32, then the mapping
     * from {@code d.base} is used instead.
     *
     * @param d the collation data to look up {@code c} in
     * @param c the code point
     */
    public void forCodePoint(CollationData d, int c) {
        int ce32 = d.getCE32(c);
        if (ce32 == Collation.FALLBACK_CE32) {
            d = d.base;
            ce32 = d.getCE32(c);
        }
        data = d;
        handleCE32(c, c, ce32);
    }

    /**
     * Dispatches on the type of {@code ce32}: sends its CEs to the sink (if any),
     * records expansions, and descends into prefix and contraction data.
     * Loops when a special CE32 merely redirects to another CE32
     * (digit and U+0000 tags).
     *
     * @param start first code point that maps to {@code ce32}
     * @param end   last code point (inclusive) that maps to {@code ce32}
     * @param ce32  the CE32 to handle
     * @throws AssertionError if the CE32 has a tag that must not occur here
     */
    private void handleCE32(int start, int end, int ce32) {
        for (;;) {
            if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE) {
                // !isSpecialCE32()
                if (sink != null) {
                    sink.handleCE(Collation.ceFromSimpleCE32(ce32));
                }
                return;
            }
            switch (Collation.tagFromCE32(ce32)) {
            case Collation.FALLBACK_TAG:
                return;
            case Collation.RESERVED_TAG_3:
            case Collation.BUILDER_DATA_TAG:
            case Collation.LEAD_SURROGATE_TAG:
                // Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C.
                throw new AssertionError(
                        String.format("Unexpected CE32 tag type %d for ce32=0x%08x",
                                Collation.tagFromCE32(ce32), ce32));
            case Collation.LONG_PRIMARY_TAG:
                if (sink != null) {
                    sink.handleCE(Collation.ceFromLongPrimaryCE32(ce32));
                }
                return;
            case Collation.LONG_SECONDARY_TAG:
                if (sink != null) {
                    sink.handleCE(Collation.ceFromLongSecondaryCE32(ce32));
                }
                return;
            case Collation.LATIN_EXPANSION_TAG:
                if (sink != null) {
                    ces[0] = Collation.latinCE0FromCE32(ce32);
                    ces[1] = Collation.latinCE1FromCE32(ce32);
                    sink.handleExpansion(ces, 0, 2);
                }
                // Optimization: If we have a prefix,
                // then the relevant strings have been added already.
                if (unreversedPrefix.length() == 0) {
                    addExpansions(start, end);
                }
                return;
            case Collation.EXPANSION32_TAG:
                if (sink != null) {
                    int idx = Collation.indexFromCE32(ce32);
                    int length = Collation.lengthFromCE32(ce32);
                    // Convert the stored CE32s into 64-bit CEs in the scratch buffer.
                    for (int i = 0; i < length; ++i) {
                        ces[i] = Collation.ceFromCE32(data.ce32s[idx + i]);
                    }
                    sink.handleExpansion(ces, 0, length);
                }
                // Optimization: If we have a prefix,
                // then the relevant strings have been added already.
                if (unreversedPrefix.length() == 0) {
                    addExpansions(start, end);
                }
                return;
            case Collation.EXPANSION_TAG:
                if (sink != null) {
                    int idx = Collation.indexFromCE32(ce32);
                    int length = Collation.lengthFromCE32(ce32);
                    // The CEs are stored as such; pass them on without copying.
                    sink.handleExpansion(data.ces, idx, length);
                }
                // Optimization: If we have a prefix,
                // then the relevant strings have been added already.
                if (unreversedPrefix.length() == 0) {
                    addExpansions(start, end);
                }
                return;
            case Collation.PREFIX_TAG:
                handlePrefixes(start, end, ce32);
                return;
            case Collation.CONTRACTION_TAG:
                handleContractions(start, end, ce32);
                return;
            case Collation.DIGIT_TAG:
                // Fetch the non-numeric-collation CE32 and continue.
                ce32 = data.ce32s[Collation.indexFromCE32(ce32)];
                break;
            case Collation.U0000_TAG:
                assert (start == 0 && end == 0);
                // Fetch the normal ce32 for U+0000 and continue.
                ce32 = data.ce32s[0];
                break;
            case Collation.HANGUL_TAG:
                if (sink != null) {
                    // TODO: This should be optimized,
                    // especially if [start..end] is the complete Hangul range. (assert that)
                    UTF16CollationIterator iter = new UTF16CollationIterator(data);
                    StringBuilder hangul = new StringBuilder(1);
                    for (int c = start; c <= end; ++c) {
                        hangul.setLength(0);
                        hangul.appendCodePoint(c);
                        iter.setText(false, hangul, 0);
                        int length = iter.fetchCEs();
                        // Ignore the terminating non-CE.
                        assert (length >= 2 && iter.getCE(length - 1) == Collation.NO_CE);
                        sink.handleExpansion(iter.getCEs(), 0, length - 1);
                    }
                }
                // Optimization: If we have a prefix,
                // then the relevant strings have been added already.
                if (unreversedPrefix.length() == 0) {
                    addExpansions(start, end);
                }
                return;
            case Collation.OFFSET_TAG:
                // Currently no need to send offset CEs to the sink.
                return;
            case Collation.IMPLICIT_TAG:
                // Currently no need to send implicit CEs to the sink.
                return;
            }
        }
    }

    /**
     * Handles a prefix-tagged CE32: first the default mapping (no prefix match),
     * then, if prefixes are requested, each prefix mapping from the prefix trie.
     *
     * @param start first code point that maps to {@code ce32}
     * @param end   last code point (inclusive) that maps to {@code ce32}
     * @param ce32  a CE32 with the prefix tag
     */
    private void handlePrefixes(int start, int end, int ce32) {
        int index = Collation.indexFromCE32(ce32);
        ce32 = data.getCE32FromContexts(index); // Default if no prefix match.
        handleCE32(start, end, ce32);
        if (!addPrefixes) {
            return;
        }
        // The trie follows the default CE32, which is read from the contexts at index.
        CharsTrie.Iterator prefixes = new CharsTrie(data.contexts, index + 2).iterator();
        while (prefixes.hasNext()) {
            Entry e = prefixes.next();
            setPrefix(e.chars);
            // Prefix/pre-context mappings are special kinds of contractions
            // that always yield expansions.
            addStrings(start, end, contractions);
            addStrings(start, end, expansions);
            handleCE32(start, end, e.value);
        }
        resetPrefix();
    }

    /**
     * Handles a contraction-tagged CE32: first the default mapping (unless the CE32
     * is flagged as having no single-code point match), then each suffix mapping
     * from the contraction trie.
     *
     * @param start first code point that maps to {@code ce32}
     * @param end   last code point (inclusive) that maps to {@code ce32}
     * @param ce32  a CE32 with the contraction tag
     */
    void handleContractions(int start, int end, int ce32) {
        int index = Collation.indexFromCE32(ce32);
        if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
            // No match on the single code point.
            // We are underneath a prefix, and the default mapping is just
            // a fallback to the mappings for a shorter prefix.
            assert (unreversedPrefix.length() != 0);
        } else {
            ce32 = data.getCE32FromContexts(index); // Default if no suffix match.
            assert (!Collation.isContractionCE32(ce32));
            handleCE32(start, end, ce32);
        }
        CharsTrie.Iterator suffixes = new CharsTrie(data.contexts, index + 2).iterator();
        while (suffixes.hasNext()) {
            Entry e = suffixes.next();
            suffix = e.chars.toString();
            addStrings(start, end, contractions);
            // Underneath a prefix, the string is also added to the expansions set.
            if (unreversedPrefix.length() != 0) {
                addStrings(start, end, expansions);
            }
            handleCE32(start, end, e.value);
        }
        suffix = null;
    }

    /**
     * Records that the code points {@code start..end}, in the current prefix/suffix
     * context, map to expansions. Without a context the code points themselves are
     * added to the expansions set; otherwise the context strings are added.
     *
     * @param start first code point
     * @param end   last code point (inclusive)
     */
    void addExpansions(int start, int end) {
        if (unreversedPrefix.length() == 0 && suffix == null) {
            if (expansions != null) {
                expansions.add(start, end);
            }
        } else {
            addStrings(start, end, expansions);
        }
    }

    /**
     * Adds to {@code set}, for each code point in {@code start..end}, the string
     * "current prefix + code point + current suffix". Does nothing if {@code set} is null.
     *
     * @param start first code point
     * @param end   last code point (inclusive)
     * @param set   the set to add the strings to, or null
     */
    void addStrings(int start, int end, UnicodeSet set) {
        if (set == null) {
            return;
        }
        StringBuilder s = new StringBuilder(unreversedPrefix);
        do {
            s.appendCodePoint(start);
            if (suffix != null) {
                s.append(suffix);
            }
            set.add(s);
            // Truncate back to the prefix for the next code point.
            s.setLength(unreversedPrefix.length());
        } while (++start <= end);
    }

    // Prefixes are reversed in the data structure.
    /**
     * Stores {@code pfx} as the current prefix, reversed into logical order.
     *
     * @param pfx the prefix as stored in the data (reversed)
     */
    private void setPrefix(CharSequence pfx) {
        unreversedPrefix.setLength(0);
        unreversedPrefix.append(pfx).reverse();
    }

    /** Clears the current prefix. */
    private void resetPrefix() {
        unreversedPrefix.setLength(0);
    }
}