Home | History | Annotate | Download | only in util
      1 /*
      2  ******************************************************************************
      3  * Copyright (C) 2004-2005, International Business Machines Corporation and        *
      4  * others. All Rights Reserved.                                               *
      5  ******************************************************************************
      6  *
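 * NOTE(review): the usage notes below refer to CachingEntityResolver and appear to have been
 * copied from that class; they do not describe ExtractCollationRules. Confirm and update.
 *
 * in shell:  (such as .cldrrc)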
      8  *   export CWDEBUG="-DCLDR_DTD_CACHE=/tmp/cldrdtd/"
      9  *   export CWDEFS="-DCLDR_DTD_CACHE_DEBUG=y ${CWDEBUG}"
     10  *
     11  *
     12  * in code:
     13  *   docBuilder.setEntityResolver(new CachingEntityResolver());
     14  *
     15  */
     16 package org.unicode.cldr.util;
     17 
     18 import java.util.Iterator;
     19 import java.util.Map;
     20 import java.util.TreeMap;
     21 
     22 import com.ibm.icu.text.Normalizer;
     23 import com.ibm.icu.text.UTF16;
     24 import com.ibm.icu.text.UnicodeSet;
     25 
     26 public class ExtractCollationRules {
     27     Map<String, String> type_rules = new TreeMap<String, String>();
     28     XPathParts parts = new XPathParts();
     29     StringBuffer rules = new StringBuffer();
     30 
     31     public ExtractCollationRules set(CLDRFile file) {
     32         type_rules.clear();
     33         String lastType = "";
     34         rules.setLength(0);
     35 
     36         String context = null;
     37 
     38         for (Iterator it = file.iterator("//ldml/collations", file.getComparator()); it.hasNext();) {
     39 
     40             // System.out.print(rules.substring(lastLen, rules.length()));
     41             // lastLen = rules.length();
     42 
     43             String path = (String) it.next();
     44             String value = file.getStringValue(path);
     45             parts.set(path);
     46             String type = parts.findAttributeValue("collation", "type");
     47             if (!type.equals(lastType)) {
     48                 lastType = type;
     49                 type_rules.put(lastType, rules.toString());
     50                 rules.setLength(0);
     51             }
     52             String mainType = parts.getElement(3);
     53             // base?, settings?, suppress_contractions?, optimize?
     54             // x: context?, ( p | pc | s | sc | t | tc | i | ic )*, extend?
     55             if (mainType.equals("settings")) {
     56                 writeSettings(parts.getAttributes(3), rules);
     57                 continue;
     58             } else if (mainType.equals("rules")) {
     59                 String ruleType = parts.getElement(4);
     60                 char c = ruleType.charAt(0);
     61                 if (c == 'x') {
     62                     ruleType = parts.getElement(5);
     63                     c = ruleType.charAt(0);
     64                 }
     65                 boolean isMultiple = ruleType.length() > 1 && ruleType.charAt(1) == 'c';
     66                 String lastContext = context;
     67                 context = null;
     68                 switch (c) {
     69                 case 'r':
     70                     appendOrdering("&", null, value, false, true);
     71                     break;
     72                 case 'p':
     73                     appendOrdering("<", lastContext, value, isMultiple, true);
     74                     break;
     75                 case 's':
     76                     appendOrdering("<<", lastContext, value, isMultiple, true);
     77                     break;
     78                 case 't':
     79                     appendOrdering("<<<", lastContext, value, isMultiple, false);
     80                     break;
     81                 case 'i':
     82                     appendOrdering("=", lastContext, value, isMultiple, false);
     83                     break;
     84                 case 'c':
     85                     context = value;
     86                     break;
     87                 case 'e':
     88                     appendOrdering("/", null, value, false, false);
     89                     break;
     90                 default:
     91                     System.out.println("Couldn't handle: " + path + "\t" + value);
     92                 }
     93                 continue;
     94             } else {
     95 
     96             }
     97             System.out.println("Couldn't handle: " + path + "\t" + value);
     98         }
     99         type_rules.put(lastType, rules.toString());
    100         return this;
    101     }
    102 
    103     private void appendOrdering(String relation, String context, String valueAfter, boolean isMultiple,
    104         boolean lineBreakBefore) {
    105         if (isMultiple) {
    106             int cp;
    107             for (int i = 0; i < valueAfter.length(); i += UTF16.getCharCount(cp)) {
    108                 cp = UTF16.charAt(valueAfter, i);
    109                 if (lineBreakBefore)
    110                     rules.append(CldrUtility.LINE_SEPARATOR);
    111                 else
    112                     rules.append(' ');
    113                 rules.append(relation);
    114                 if (context != null) rules.append(' ').append(quote(context));
    115                 rules.append(' ').append(quote(UTF16.valueOf(cp)));
    116             }
    117         } else {
    118             if (lineBreakBefore)
    119                 rules.append(CldrUtility.LINE_SEPARATOR);
    120             else
    121                 rules.append(' ');
    122             rules.append(relation);
    123             if (context != null) rules.append(' ').append(quote(context));
    124             rules.append(' ').append(quote(valueAfter));
    125         }
    126     }
    127 
    128     private void writeSettings(Map<String, String> attributes, StringBuffer results) {
    129         for (Iterator<String> it = attributes.keySet().iterator(); it.hasNext();) {
    130             String attribute = it.next();
    131             String value = attributes.get(attribute);
    132             // TODO fix different cases
    133             results.append("[" + attribute + " " + value + "]" + CldrUtility.LINE_SEPARATOR);
    134             // if (attribute.equals("normalization")) {
    135             //
    136             // }
    137         }
    138     }
    139 
    140     public Iterator<String> iterator() {
    141         return type_rules.keySet().iterator();
    142     }
    143 
    144     public String getRules(Object key) {
    145         return (String) type_rules.get(key);
    146     }
    147 
    148     static StringBuffer quoteOperandBuffer = new StringBuffer(); // faster
    149 
    150     static UnicodeSet needsQuoting = null;
    151     static UnicodeSet needsUnicodeForm = null;
    152 
    153     static final String quote(String s) {
    154         if (needsQuoting == null) {
    155             /*
    156              * c >= 'a' && c <= 'z'
    157              * || c >= 'A' && c <= 'Z'
    158              * || c >= '0' && c <= '9'
    159              * || (c >= 0xA0 && !UCharacterProperty.isRuleWhiteSpace(c))
    160              */
    161             needsQuoting = new UnicodeSet(
    162                 "[[:whitespace:][:c:][:z:][:ascii:]-[a-zA-Z0-9]]"); //
    163             // "[[:ascii:]-[a-zA-Z0-9]-[:c:]-[:z:]]"); // [:whitespace:][:c:][:z:]
    164             // for (int i = 0; i <= 0x10FFFF; ++i) {
    165             // if (UCharacterProperty.isRuleWhiteSpace(i)) needsQuoting.add(i);
    166             // }
    167             // needsQuoting.remove();
    168             needsUnicodeForm = new UnicodeSet("[\\u000d\\u000a[:zl:][:zp:]]");
    169         }
    170         s = Normalizer.compose(s, false);
    171         quoteOperandBuffer.setLength(0);
    172         boolean noQuotes = true;
    173         boolean inQuote = false;
    174         int cp;
    175         for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
    176             cp = UTF16.charAt(s, i);
    177             if (!needsQuoting.contains(cp)) {
    178                 if (inQuote) {
    179                     quoteOperandBuffer.append('\'');
    180                     inQuote = false;
    181                 }
    182                 quoteOperandBuffer.append(UTF16.valueOf(cp));
    183             } else {
    184                 noQuotes = false;
    185                 if (cp == '\'') {
    186                     quoteOperandBuffer.append("''");
    187                 } else {
    188                     if (!inQuote) {
    189                         quoteOperandBuffer.append('\'');
    190                         inQuote = true;
    191                     }
    192                     if (!needsUnicodeForm.contains(cp))
    193                         quoteOperandBuffer.append(UTF16.valueOf(cp)); // cp != 0x2028
    194                     else if (cp > 0xFFFF) {
    195                         quoteOperandBuffer.append("\\U").append(hex(cp, 8));
    196                     } else if (cp <= 0x20 || cp > 0x7E) {
    197                         quoteOperandBuffer.append("\\u").append(hex(cp, 4));
    198                     } else {
    199                         quoteOperandBuffer.append(UTF16.valueOf(cp));
    200                     }
    201                 }
    202             }
    203             /*
    204              * switch (c) {
    205              * case '<': case '>': case '#': case '=': case '&': case '/':
    206              * quoteOperandBuffer.append('\'').append(c).append('\'');
    207              * break;
    208              * case '\'':
    209              * quoteOperandBuffer.append("''");
    210              * break;
    211              * default:
    212              * if (0 <= c && c < 0x20 || 0x7F <= c && c < 0xA0) {
    213              * quoteOperandBuffer.append("\\u").append(Utility.hex(c));
    214              * break;
    215              * }
    216              * quoteOperandBuffer.append(c);
    217              * break;
    218              * }
    219              */
    220         }
    221         if (inQuote) {
    222             quoteOperandBuffer.append('\'');
    223         }
    224         if (noQuotes) return s; // faster
    225         return quoteOperandBuffer.toString();
    226     }
    227 
    228     static public String hex(long i, int places) {
    229         if (i == Long.MIN_VALUE) return "-8000000000000000";
    230         boolean negative = i < 0;
    231         if (negative) {
    232             i = -i;
    233         }
    234         String result = Long.toString(i, 16).toUpperCase();
    235         if (result.length() < places) {
    236             result = "0000000000000000".substring(result.length(), places) + result;
    237         }
    238         if (negative) {
    239             return '-' + result;
    240         }
    241         return result;
    242     }
    243 
    244 }
    245