Home | History | Annotate | Download | only in util
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 1996-2012, International Business Machines Corporation and    *
      4  * others. All Rights Reserved.                                                *
      5  **********************************************************************
      6  * Author: Mark Davis
      7  **********************************************************************
      8  */
      9 
     10 package com.ibm.icu.dev.util;
     11 
     12 import java.io.IOException;
     13 import java.text.FieldPosition;
     14 import java.util.Comparator;
     15 import java.util.TreeSet;
     16 
     17 import com.ibm.icu.impl.Utility;
     18 import com.ibm.icu.lang.UCharacter;
     19 import com.ibm.icu.text.StringTransform;
     20 import com.ibm.icu.text.UTF16;
     21 import com.ibm.icu.text.UTF16.StringComparator;
     22 import com.ibm.icu.text.UnicodeSet;
     23 import com.ibm.icu.text.UnicodeSetIterator;
     24 
     25 /** Provides more flexible formatting of UnicodeSet patterns.
     26  */
     27 public class PrettyPrinter {
     28     private static final StringComparator CODEPOINT_ORDER = new UTF16.StringComparator(true,false,0);
     29     private static final UnicodeSet PATTERN_WHITESPACE = (UnicodeSet) new UnicodeSet("[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]").freeze();
     30     private static final UnicodeSet SORT_AT_END = (UnicodeSet) new UnicodeSet("[[:Cn:][:Cs:][:Co:][:Ideographic:]]").freeze();
     31     private static final UnicodeSet QUOTED_SYNTAX = (UnicodeSet) new UnicodeSet("[\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]").addAll(PATTERN_WHITESPACE).freeze();
     32 
     33     private boolean first = true;
     34     private StringBuffer target = new StringBuffer();
     35     private int firstCodePoint = -2;
     36     private int lastCodePoint = -2;
     37     private boolean compressRanges = true;
     38     private String lastString = "";
     39     private UnicodeSet toQuote = new UnicodeSet(PATTERN_WHITESPACE);
     40     private StringTransform quoter = null;
     41 
     42     private Comparator<String> ordering;
     43     private Comparator<String> spaceComp;
     44 
     45     public PrettyPrinter() {
     46     }
     47 
     48     public StringTransform getQuoter() {
     49         return quoter;
     50     }
     51 
     52     public PrettyPrinter setQuoter(StringTransform quoter) {
     53         this.quoter = quoter;
     54         return this; // for chaining
     55     }
     56 
     57     public boolean isCompressRanges() {
     58         return compressRanges;
     59     }
     60 
     61     /**
     62      * @param compressRanges if you want abcde instead of a-e, make this false
     63      * @return
     64      */
     65     public PrettyPrinter setCompressRanges(boolean compressRanges) {
     66         this.compressRanges = compressRanges;
     67         return this;
     68     }
     69 
     70     public Comparator<String> getOrdering() {
     71         return ordering;
     72     }
     73 
     74     /**
     75      * @param ordering the resulting  ordering of the list of characters in the pattern
     76      * @return
     77      */
     78     public PrettyPrinter setOrdering(Comparator ordering) {
     79         this.ordering = ordering == null ? CODEPOINT_ORDER : new com.ibm.icu.impl.MultiComparator<String>(ordering, CODEPOINT_ORDER);
     80         return this;
     81     }
     82 
     83     public Comparator<String> getSpaceComparator() {
     84         return spaceComp;
     85     }
     86 
     87     /**
     88      * @param spaceComp if the comparison returns non-zero, then a space will be inserted between characters
     89      * @return this, for chaining
     90      */
     91     public PrettyPrinter setSpaceComparator(Comparator spaceComp) {
     92         this.spaceComp = spaceComp;
     93         return this;
     94     }
     95 
     96     public UnicodeSet getToQuote() {
     97         return toQuote;
     98     }
     99 
    100     /**
    101      * a UnicodeSet of extra characters to quote with \\uXXXX-style escaping (will automatically quote pattern whitespace)
    102      * @param toQuote
    103      */
    104     public PrettyPrinter setToQuote(UnicodeSet toQuote) {
    105         if (toQuote != null) {
    106             toQuote = (UnicodeSet)toQuote.cloneAsThawed();
    107             toQuote.addAll(PATTERN_WHITESPACE);
    108             this.toQuote = toQuote;
    109         }
    110         return this;
    111     }
    112 
    113 
    114     /**
    115      * Get the pattern for a particular set.
    116      * @param uset
    117      * @return formatted UnicodeSet
    118      */
    119     public String format(UnicodeSet uset) {
    120         first = true;
    121         UnicodeSet putAtEnd = new UnicodeSet(uset).retainAll(SORT_AT_END); // remove all the unassigned gorp for now
    122         // make sure that comparison separates all strings, even canonically equivalent ones
    123         TreeSet<String> orderedStrings = new TreeSet<String>(ordering);
    124         for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.nextRange();) {
    125             if (it.codepoint == UnicodeSetIterator.IS_STRING) {
    126                 orderedStrings.add(it.string);
    127             } else {
    128                 for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
    129                     if (!putAtEnd.contains(i)) {
    130                         orderedStrings.add(UTF16.valueOf(i));
    131                     }
    132                 }
    133             }
    134         }
    135         target.setLength(0);
    136         target.append("[");
    137         for (String item : orderedStrings) {
    138             appendUnicodeSetItem(item);
    139         }
    140         for (UnicodeSetIterator it = new UnicodeSetIterator(putAtEnd); it.next();) { // add back the unassigned gorp
    141             appendUnicodeSetItem(it.codepoint); // we know that these are only codepoints, not strings, so this is safe
    142         }
    143         flushLast();
    144         target.append("]");
    145         String sresult = target.toString();
    146 
    147         // double check the results. This can be removed once we have more tests.
    148         //        try {
    149         //            UnicodeSet  doubleCheck = new UnicodeSet(sresult);
    150         //            if (!uset.equals(doubleCheck)) {
    151         //                throw new IllegalStateException("Failure to round-trip in pretty-print " + uset + " => " + sresult + Utility.LINE_SEPARATOR + " source-result: " + new UnicodeSet(uset).removeAll(doubleCheck) +  Utility.LINE_SEPARATOR + " result-source: " + new UnicodeSet(doubleCheck).removeAll(uset));
    152         //            }
    153         //        } catch (RuntimeException e) {
    154         //            throw (RuntimeException) new IllegalStateException("Failure to round-trip in pretty-print " + uset).initCause(e);
    155         //        }
    156         return sresult;
    157     }
    158 
    159     private PrettyPrinter appendUnicodeSetItem(String s) {
    160         if (UTF16.hasMoreCodePointsThan(s, 1)) {
    161             flushLast();
    162             addSpaceAsNeededBefore(s);
    163             appendQuoted(s);
    164             lastString = s;
    165         } else {
    166             appendUnicodeSetItem(UTF16.charAt(s, 0));
    167         }
    168         return this;
    169     }
    170 
    171     private void appendUnicodeSetItem(int cp) {
    172         if (!compressRanges)
    173             flushLast();
    174         if (cp == lastCodePoint + 1) {
    175             lastCodePoint = cp; // continue range
    176         } else { // start range
    177             flushLast();
    178             firstCodePoint = lastCodePoint = cp;
    179         }
    180     }
    181     /**
    182      *
    183      */
    184     private void addSpaceAsNeededBefore(String s) {
    185         if (first) {
    186             first = false;
    187         } else if (spaceComp != null && spaceComp.compare(s, lastString) != 0) {
    188             target.append(' ');
    189         } else {
    190             int cp = UTF16.charAt(s,0);
    191             if (!toQuote.contains(cp) && !QUOTED_SYNTAX.contains(cp)) {
    192                 int type = UCharacter.getType(cp);
    193                 if (type == UCharacter.NON_SPACING_MARK || type == UCharacter.ENCLOSING_MARK) {
    194                     target.append(' ');
    195                 } else if (type == UCharacter.SURROGATE && cp >= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
    196                     target.append(' '); // make sure we don't accidentally merge two surrogates
    197                 }
    198             }
    199         }
    200     }
    201 
    202     private void addSpaceAsNeededBefore(int codepoint) {
    203         addSpaceAsNeededBefore(UTF16.valueOf(codepoint));
    204     }
    205 
    206     private void flushLast() {
    207         if (lastCodePoint >= 0) {
    208             addSpaceAsNeededBefore(firstCodePoint);
    209             if (firstCodePoint != lastCodePoint) {
    210                 appendQuoted(firstCodePoint);
    211                 if (firstCodePoint + 1 != lastCodePoint) {
    212                     target.append('-');
    213                 } else {
    214                     addSpaceAsNeededBefore(lastCodePoint);
    215                 }
    216             }
    217             appendQuoted(lastCodePoint);
    218             lastString = UTF16.valueOf(lastCodePoint);
    219             firstCodePoint = lastCodePoint = -2;
    220         }
    221     }
    222 
    223 
    224     private void appendQuoted(String s) {
    225         if (toQuote.containsSome(s) && quoter != null) {
    226             target.append(quoter.transform(s));
    227         } else {
    228             int cp;
    229             target.append("{");
    230             for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
    231                 appendQuoted(cp = UTF16.charAt(s, i));
    232             }
    233             target.append("}");
    234         }
    235     }
    236 
    237     PrettyPrinter appendQuoted(int codePoint) {
    238         if (toQuote.contains(codePoint)) {
    239             if (quoter != null) {
    240                 target.append(quoter.transform(UTF16.valueOf(codePoint)));
    241                 return this;
    242             }
    243             if (codePoint > 0xFFFF) {
    244                 target.append("\\U");
    245                 target.append(Utility.hex(codePoint,8));
    246             } else {
    247                 target.append("\\u");
    248                 target.append(Utility.hex(codePoint,4));
    249             }
    250             return this;
    251         }
    252         switch (codePoint) {
    253         case '[': // SET_OPEN:
    254         case ']': // SET_CLOSE:
    255         case '-': // HYPHEN:
    256         case '^': // COMPLEMENT:
    257         case '&': // INTERSECTION:
    258         case '\\': //BACKSLASH:
    259         case '{':
    260         case '}':
    261         case '$':
    262         case ':':
    263             target.append('\\');
    264             break;
    265         default:
    266             // Escape whitespace
    267             if (PATTERN_WHITESPACE.contains(codePoint)) {
    268                 target.append('\\');
    269             }
    270             break;
    271         }
    272         UTF16.append(target, codePoint);
    273         return this;
    274     }
    275     //  Appender append(String s) {
    276     //  target.append(s);
    277     //  return this;
    278     //  }
    279     //  public String toString() {
    280     //  return target.toString();
    281     //  }
    282 
    283     public Appendable format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos) {
    284         try {
    285             return toAppendTo.append(format(obj));
    286         } catch (IOException e) {
    287             throw new IllegalArgumentException(e);
    288         }
    289     }
    290 }
    291