/*
 ******************************************************************************
 * Copyright (C) 2004-2005, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 *
 * in shell:  (such as .cldrrc)
 *   export CWDEBUG="-DCLDR_DTD_CACHE=/tmp/cldrdtd/"
 *   export CWDEFS="-DCLDR_DTD_CACHE_DEBUG=y ${CWDEBUG}"
 *
 *
 * in code:
 *   docBuilder.setEntityResolver(new CachingEntityResolver());
 *
 */
package org.unicode.cldr.util;

import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * Reads the {@code //ldml/collations} paths of a {@link CLDRFile} and builds, for each
 * collation type, one rule string made of {@code [attribute value]} setting lines and
 * {@code & < << <<< = /} relations followed by quoted operands.
 *
 * <p>Typical use: call {@link #set(CLDRFile)}, then walk {@link #iterator()} and fetch each
 * type's text with {@link #getRules(Object)}.
 */
public class ExtractCollationRules {
    /** Collation type -> rule text built for that type (sorted by type). */
    Map<String, String> type_rules = new TreeMap<String, String>();

    /** Path parser reused for every path visited by {@link #set(CLDRFile)}. */
    XPathParts parts = new XPathParts();

    /** Rule text accumulated for the collation type currently being read. */
    StringBuffer rules = new StringBuffer();

    /**
     * Discards any previous result and rebuilds the per-type rule strings from {@code file}.
     *
     * <p>Paths are visited in the order given by {@code file.getComparator()}; the rules of a type
     * are stored when the next type starts and once more after the last path. Paths that are
     * neither {@code settings} nor a recognised {@code rules} child are reported on
     * {@code System.out} and skipped.
     *
     * @param file the file whose {@code //ldml/collations} paths are read
     * @return this object, so calls can be chained
     */
    public ExtractCollationRules set(CLDRFile file) {
        type_rules.clear();
        String lastType = "";
        rules.setLength(0);

        // Value of the most recent <context> element; it applies only to the very next rule.
        String context = null;

        for (Iterator<String> it = file.iterator("//ldml/collations", file.getComparator()); it.hasNext();) {
            String path = it.next();
            String value = file.getStringValue(path);
            parts.set(path);
            String type = parts.findAttributeValue("collation", "type");
            if (type == null) {
                // NOTE(review): presumably findAttributeValue returns null when the path has no
                // collation element or no type attribute - confirm against XPathParts. Report the
                // path instead of failing with a NullPointerException below.
                System.out.println("Couldn't handle: " + path + "\t" + value);
                continue;
            }
            if (!type.equals(lastType)) {
                // Flush what was accumulated under the type it belongs to BEFORE switching;
                // storing it under the new type would mis-attribute the previous type's rules.
                // Nothing has been accumulated yet while lastType is still the initial "".
                if (lastType.length() > 0) {
                    type_rules.put(lastType, rules.toString());
                }
                lastType = type;
                rules.setLength(0);
            }
            String mainType = parts.getElement(3);
            // base?, settings?, suppress_contractions?, optimize?
            // x: context?, ( p | pc | s | sc | t | tc | i | ic )*, extend?
            if (mainType.equals("settings")) {
                writeSettings(parts.getAttributes(3), rules);
            } else if (mainType.equals("rules")) {
                String ruleType = parts.getElement(4);
                char c = ruleType.charAt(0);
                if (c == 'x') {
                    // <x> only wraps other rule elements; the real rule is one level deeper.
                    ruleType = parts.getElement(5);
                    c = ruleType.charAt(0);
                }
                // pc / sc / tc / ic: every code point of the value is a separate operand.
                boolean isMultiple = ruleType.length() > 1 && ruleType.charAt(1) == 'c';
                String lastContext = context;
                context = null;
                switch (c) {
                case 'r':
                    appendOrdering("&", null, value, false, true);
                    break;
                case 'p':
                    appendOrdering("<", lastContext, value, isMultiple, true);
                    break;
                case 's':
                    appendOrdering("<<", lastContext, value, isMultiple, true);
                    break;
                case 't':
                    appendOrdering("<<<", lastContext, value, isMultiple, false);
                    break;
                case 'i':
                    appendOrdering("=", lastContext, value, isMultiple, false);
                    break;
                case 'c':
                    // Remember the context for the rule that follows.
                    context = value;
                    break;
                case 'e':
                    appendOrdering("/", null, value, false, false);
                    break;
                default:
                    System.out.println("Couldn't handle: " + path + "\t" + value);
                }
            } else {
                System.out.println("Couldn't handle: " + path + "\t" + value);
            }
        }
        // Store the last type. As before, this also leaves a single "" -> "" entry when no
        // path was processed at all.
        type_rules.put(lastType, rules.toString());
        return this;
    }

    /**
     * Appends one relation to {@link #rules}, or one relation per code point of
     * {@code valueAfter} when {@code isMultiple} is set.
     *
     * @param relation the relation text, e.g. {@code "&"} or {@code "<<"}
     * @param context optional context written (quoted) before the operand; may be null
     * @param valueAfter the operand, or the string whose code points are the operands
     * @param isMultiple true to emit a separate relation for every code point of {@code valueAfter}
     * @param lineBreakBefore true to start each relation on a new line, false to separate with a space
     */
    private void appendOrdering(String relation, String context, String valueAfter, boolean isMultiple,
        boolean lineBreakBefore) {
        if (!isMultiple) {
            appendRelation(relation, context, valueAfter, lineBreakBefore);
            return;
        }
        int cp;
        // Step by code point so that supplementary characters are not split.
        for (int i = 0; i < valueAfter.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(valueAfter, i);
            appendRelation(relation, context, UTF16.valueOf(cp), lineBreakBefore);
        }
    }

    /** Appends a single "separator, relation, optional quoted context, quoted operand" group to {@link #rules}. */
    private void appendRelation(String relation, String context, String operand, boolean lineBreakBefore) {
        if (lineBreakBefore) {
            rules.append(CldrUtility.LINE_SEPARATOR);
        } else {
            rules.append(' ');
        }
        rules.append(relation);
        if (context != null) {
            rules.append(' ').append(quote(context));
        }
        rules.append(' ').append(quote(operand));
    }

    /**
     * Appends every attribute as a {@code [name value]} line to {@code results}.
     *
     * @param attributes the attributes of a settings element; null is treated as "no settings"
     * @param results the buffer that receives the lines
     */
    private void writeSettings(Map<String, String> attributes, StringBuffer results) {
        if (attributes == null) {
            return;
        }
        for (Map.Entry<String, String> setting : attributes.entrySet()) {
            // TODO fix different cases
            results.append("[" + setting.getKey() + " " + setting.getValue() + "]" + CldrUtility.LINE_SEPARATOR);
        }
    }

    /**
     * @return an iterator over the collation types collected by the last {@link #set(CLDRFile)} call
     */
    public Iterator<String> iterator() {
        return type_rules.keySet().iterator();
    }

    /**
     * @param key a collation type as returned by {@link #iterator()}
     * @return the rule text stored for {@code key}, or null if there is none
     */
    public String getRules(Object key) {
        return type_rules.get(key);
    }

    /**
     * No longer used by {@link #quote(String)}, which now builds its result in a local buffer;
     * retained only so that any code referring to this field keeps compiling.
     */
    static StringBuffer quoteOperandBuffer = new StringBuffer();

    /** Code points that may only appear quoted in a rule; created on first use by {@link #quote(String)}. */
    static UnicodeSet needsQuoting = null;

    /** Code points written in escaped form inside a quote; created on first use by {@link #quote(String)}. */
    static UnicodeSet needsUnicodeForm = null;

    /**
     * Composes {@code s} with {@link Normalizer#compose} and quotes it for use as a rule operand.
     *
     * <p>Runs of code points contained in {@link #needsQuoting} are wrapped in apostrophes, an
     * apostrophe itself is written as two apostrophes, and code points contained in
     * {@link #needsUnicodeForm} are written as a backslash-u or backslash-U escape inside the quote.
     *
     * @param s the operand text
     * @return the composed text unchanged if nothing needed quoting, otherwise the quoted form
     */
    static final String quote(String s) {
        if (needsQuoting == null) {
            // White space, general categories C and Z, and all of ASCII except letters and digits.
            needsQuoting = new UnicodeSet(
                "[[:whitespace:][:c:][:z:][:ascii:]-[a-zA-Z0-9]]");
            // CR, LF, line separators and paragraph separators.
            needsUnicodeForm = new UnicodeSet("[\\u000d\\u000a[:zl:][:zp:]]");
        }
        s = Normalizer.compose(s, false);
        // Local scratch buffer: calls do not share state through a static field.
        StringBuilder quoted = new StringBuilder(s.length() + 8);
        boolean noQuotes = true;
        boolean inQuote = false;
        int cp;
        for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
            cp = UTF16.charAt(s, i);
            if (!needsQuoting.contains(cp)) {
                // Plain code point: close any open quote first.
                if (inQuote) {
                    quoted.append('\'');
                    inQuote = false;
                }
                quoted.append(UTF16.valueOf(cp));
                continue;
            }
            noQuotes = false;
            if (cp == '\'') {
                // A doubled apostrophe is written the same way inside and outside a quote.
                quoted.append("''");
                continue;
            }
            if (!inQuote) {
                quoted.append('\'');
                inQuote = true;
            }
            if (!needsUnicodeForm.contains(cp)) {
                quoted.append(UTF16.valueOf(cp));
            } else if (cp > 0xFFFF) {
                quoted.append("\\U").append(hex(cp, 8));
            } else if (cp <= 0x20 || cp > 0x7E) {
                quoted.append("\\u").append(hex(cp, 4));
            } else {
                quoted.append(UTF16.valueOf(cp));
            }
        }
        if (inQuote) {
            quoted.append('\'');
        }
        if (noQuotes) {
            return s; // nothing was quoted, so the composed input is already the answer
        }
        return quoted.toString();
    }

    /**
     * Formats {@code i} as upper-case hexadecimal, left-padded with zeros to at least
     * {@code places} digits. Negative values are written as {@code '-'} followed by the padded
     * magnitude.
     *
     * @param i the value to format
     * @param places the minimum number of digits; values larger than 16 are allowed
     * @return the formatted text
     */
    static public String hex(long i, int places) {
        // -Long.MIN_VALUE overflows, so that one value is spelled out.
        if (i == Long.MIN_VALUE) return "-8000000000000000";
        boolean negative = i < 0;
        if (negative) {
            i = -i;
        }
        String digits = Long.toString(i, 16).toUpperCase();
        StringBuilder result = new StringBuilder();
        if (negative) {
            result.append('-');
        }
        // Pad in a loop rather than slicing a fixed 16-zero string, which failed for places > 16.
        for (int pad = places - digits.length(); pad > 0; --pad) {
            result.append('0');
        }
        return result.append(digits).toString();
    }

}