Home | History | Annotate | Download | only in util
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 1996-2012, International Business Machines Corporation and    *
      4  * others. All Rights Reserved.                                                *
      5  **********************************************************************
      6  * Author: Mark Davis
      7  **********************************************************************
      8  */
      9 
     10 package org.unicode.cldr.util;
     11 
     12 import java.io.IOException;
     13 import java.text.FieldPosition;
     14 import java.util.Comparator;
     15 import java.util.TreeSet;
     16 
     17 import com.ibm.icu.impl.Utility;
     18 import com.ibm.icu.lang.UCharacter;
     19 import com.ibm.icu.text.StringTransform;
     20 import com.ibm.icu.text.UTF16;
     21 import com.ibm.icu.text.UTF16.StringComparator;
     22 import com.ibm.icu.text.UnicodeSet;
     23 import com.ibm.icu.text.UnicodeSetIterator;
     24 import com.ibm.icu.util.ICUUncheckedIOException;
     25 
     26 /** Provides more flexible formatting of UnicodeSet patterns.
     27  */
     28 public class UnicodeSetPrettyPrinter {
     29     private static final StringComparator CODEPOINT_ORDER = new UTF16.StringComparator(true, false, 0);
     30     private static final UnicodeSet PATTERN_WHITESPACE = (UnicodeSet) new UnicodeSet("[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]").freeze();
     31     private static final UnicodeSet SORT_AT_END = (UnicodeSet) new UnicodeSet("[[:Cn:][:Cs:][:Co:][:Ideographic:]]").freeze();
     32     private static final UnicodeSet QUOTED_SYNTAX = (UnicodeSet) new UnicodeSet("[\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]").addAll(PATTERN_WHITESPACE).freeze();
     33 
     34     private boolean first = true;
     35     private StringBuffer target = new StringBuffer();
     36     private int firstCodePoint = -2;
     37     private int lastCodePoint = -2;
     38     private boolean compressRanges = true;
     39     private String lastString = "";
     40     private UnicodeSet toQuote = new UnicodeSet(PATTERN_WHITESPACE);
     41     private StringTransform quoter = null;
     42 
     43     private Comparator<String> ordering;
     44     private Comparator<String> spaceComp;
     45 
     46     public UnicodeSetPrettyPrinter() {
     47     }
     48 
     49     public StringTransform getQuoter() {
     50         return quoter;
     51     }
     52 
     53     public UnicodeSetPrettyPrinter setQuoter(StringTransform quoter) {
     54         this.quoter = quoter;
     55         return this; // for chaining
     56     }
     57 
     58     public boolean isCompressRanges() {
     59         return compressRanges;
     60     }
     61 
     62     /**
     63      * @param compressRanges if you want abcde instead of a-e, make this false
     64      * @return
     65      */
     66     public UnicodeSetPrettyPrinter setCompressRanges(boolean compressRanges) {
     67         this.compressRanges = compressRanges;
     68         return this;
     69     }
     70 
     71     public Comparator<String> getOrdering() {
     72         return ordering;
     73     }
     74 
     75     /**
     76      * @param ordering the resulting  ordering of the list of characters in the pattern
     77      * @return
     78      */
     79     public UnicodeSetPrettyPrinter setOrdering(Comparator ordering) {
     80         this.ordering = ordering == null ? CODEPOINT_ORDER : new org.unicode.cldr.util.MultiComparator<String>(ordering, CODEPOINT_ORDER);
     81         return this;
     82     }
     83 
     84     public Comparator<String> getSpaceComparator() {
     85         return spaceComp;
     86     }
     87 
     88     /**
     89      * @param spaceComp if the comparison returns non-zero, then a space will be inserted between characters
     90      * @return this, for chaining
     91      */
     92     public UnicodeSetPrettyPrinter setSpaceComparator(Comparator spaceComp) {
     93         this.spaceComp = spaceComp;
     94         return this;
     95     }
     96 
     97     public UnicodeSet getToQuote() {
     98         return toQuote;
     99     }
    100 
    101     /**
    102      * a UnicodeSet of extra characters to quote with \\uXXXX-style escaping (will automatically quote pattern whitespace)
    103      * @param toQuote
    104      */
    105     public UnicodeSetPrettyPrinter setToQuote(UnicodeSet toQuote) {
    106         if (toQuote != null) {
    107             toQuote = (UnicodeSet) toQuote.cloneAsThawed();
    108             toQuote.addAll(PATTERN_WHITESPACE);
    109             this.toQuote = toQuote;
    110         }
    111         return this;
    112     }
    113 
    114     /**
    115      * Get the pattern for a particular set.
    116      * @param uset
    117      * @return formatted UnicodeSet
    118      */
    119     public String format(UnicodeSet uset) {
    120         first = true;
    121         UnicodeSet putAtEnd = new UnicodeSet(uset).retainAll(SORT_AT_END); // remove all the unassigned gorp for now
    122         // make sure that comparison separates all strings, even canonically equivalent ones
    123         TreeSet<String> orderedStrings = new TreeSet<String>(ordering);
    124         for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.nextRange();) {
    125             if (it.codepoint == UnicodeSetIterator.IS_STRING) {
    126                 orderedStrings.add(it.string);
    127             } else {
    128                 for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
    129                     if (!putAtEnd.contains(i)) {
    130                         orderedStrings.add(UTF16.valueOf(i));
    131                     }
    132                 }
    133             }
    134         }
    135         target.setLength(0);
    136         target.append("[");
    137         for (String item : orderedStrings) {
    138             appendUnicodeSetItem(item);
    139         }
    140         for (UnicodeSetIterator it = new UnicodeSetIterator(putAtEnd); it.next();) { // add back the unassigned gorp
    141             appendUnicodeSetItem(it.codepoint); // we know that these are only codepoints, not strings, so this is safe
    142         }
    143         flushLast();
    144         target.append("]");
    145         String sresult = target.toString();
    146 
    147         // double check the results. This can be removed once we have more tests.
    148         //        try {
    149         //            UnicodeSet  doubleCheck = new UnicodeSet(sresult);
    150         //            if (!uset.equals(doubleCheck)) {
    151         //                throw new IllegalStateException("Failure to round-trip in pretty-print " + uset + " => " + sresult + Utility.LINE_SEPARATOR + " source-result: " + new UnicodeSet(uset).removeAll(doubleCheck) +  Utility.LINE_SEPARATOR + " result-source: " + new UnicodeSet(doubleCheck).removeAll(uset));
    152         //            }
    153         //        } catch (RuntimeException e) {
    154         //            throw (RuntimeException) new IllegalStateException("Failure to round-trip in pretty-print " + uset).initCause(e);
    155         //        }
    156         return sresult;
    157     }
    158 
    159     private UnicodeSetPrettyPrinter appendUnicodeSetItem(String s) {
    160         if (UTF16.hasMoreCodePointsThan(s, 1)) {
    161             flushLast();
    162             addSpaceAsNeededBefore(s);
    163             appendQuoted(s);
    164             lastString = s;
    165         } else {
    166             appendUnicodeSetItem(UTF16.charAt(s, 0));
    167         }
    168         return this;
    169     }
    170 
    171     private void appendUnicodeSetItem(int cp) {
    172         if (!compressRanges)
    173             flushLast();
    174         if (cp == lastCodePoint + 1) {
    175             lastCodePoint = cp; // continue range
    176         } else { // start range
    177             flushLast();
    178             firstCodePoint = lastCodePoint = cp;
    179         }
    180     }
    181 
    182     /**
    183      *
    184      */
    185     private void addSpaceAsNeededBefore(String s) {
    186         if (first) {
    187             first = false;
    188         } else if (spaceComp != null && spaceComp.compare(s, lastString) != 0) {
    189             target.append(' ');
    190         } else {
    191             int cp = UTF16.charAt(s, 0);
    192             if (!toQuote.contains(cp) && !QUOTED_SYNTAX.contains(cp)) {
    193                 int type = UCharacter.getType(cp);
    194                 if (type == UCharacter.NON_SPACING_MARK || type == UCharacter.ENCLOSING_MARK) {
    195                     target.append(' ');
    196                 } else if (type == UCharacter.SURROGATE && cp >= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
    197                     target.append(' '); // make sure we don't accidentally merge two surrogates
    198                 }
    199             }
    200         }
    201     }
    202 
    203     private void addSpaceAsNeededBefore(int codepoint) {
    204         addSpaceAsNeededBefore(UTF16.valueOf(codepoint));
    205     }
    206 
    207     private void flushLast() {
    208         if (lastCodePoint >= 0) {
    209             addSpaceAsNeededBefore(firstCodePoint);
    210             if (firstCodePoint != lastCodePoint) {
    211                 appendQuoted(firstCodePoint);
    212                 if (firstCodePoint + 1 != lastCodePoint) {
    213                     target.append('-');
    214                 } else {
    215                     addSpaceAsNeededBefore(lastCodePoint);
    216                 }
    217             }
    218             appendQuoted(lastCodePoint);
    219             lastString = UTF16.valueOf(lastCodePoint);
    220             firstCodePoint = lastCodePoint = -2;
    221         }
    222     }
    223 
    224     private void appendQuoted(String s) {
    225         if (toQuote.containsSome(s) && quoter != null) {
    226             target.append(quoter.transform(s));
    227         } else {
    228             int cp;
    229             target.append("{");
    230             for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
    231                 appendQuoted(cp = UTF16.charAt(s, i));
    232             }
    233             target.append("}");
    234         }
    235     }
    236 
    237     UnicodeSetPrettyPrinter appendQuoted(int codePoint) {
    238         if (toQuote.contains(codePoint)) {
    239             if (quoter != null) {
    240                 target.append(quoter.transform(UTF16.valueOf(codePoint)));
    241                 return this;
    242             }
    243             if (codePoint > 0xFFFF) {
    244                 target.append("\\U");
    245                 target.append(Utility.hex(codePoint, 8));
    246             } else {
    247                 target.append("\\u");
    248                 target.append(Utility.hex(codePoint, 4));
    249             }
    250             return this;
    251         }
    252         switch (codePoint) {
    253         case '[': // SET_OPEN:
    254         case ']': // SET_CLOSE:
    255         case '-': // HYPHEN:
    256         case '^': // COMPLEMENT:
    257         case '&': // INTERSECTION:
    258         case '\\': //BACKSLASH:
    259         case '{':
    260         case '}':
    261         case '$':
    262         case ':':
    263             target.append('\\');
    264             break;
    265         default:
    266             // Escape whitespace
    267             if (PATTERN_WHITESPACE.contains(codePoint)) {
    268                 target.append('\\');
    269             }
    270             break;
    271         }
    272         UTF16.append(target, codePoint);
    273         return this;
    274     }
    275     //  Appender append(String s) {
    276     //  target.append(s);
    277     //  return this;
    278     //  }
    279     //  public String toString() {
    280     //  return target.toString();
    281     //  }
    282 
    283     public Appendable format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos) {
    284         try {
    285             return toAppendTo.append(format(obj));
    286         } catch (IOException e) {
    287             throw new ICUUncheckedIOException(e);
    288         }
    289     }
    290 }
    291