Home | History | Annotate | Download | only in icu
      1 package org.unicode.cldr.icu;
      2 
      3 import java.io.BufferedReader;
      4 import java.io.File;
      5 import java.io.FileNotFoundException;
      6 import java.io.IOException;
      7 import java.io.Reader;
      8 import java.util.ArrayList;
      9 import java.util.Collections;
     10 import java.util.Comparator;
     11 import java.util.List;
     12 import java.util.Set;
     13 import java.util.TreeSet;
     14 
     15 import org.unicode.cldr.draft.FileUtilities;
     16 import org.unicode.cldr.tool.Option.Options;
     17 
     18 import com.ibm.icu.impl.Row;
     19 import com.ibm.icu.impl.Row.R2;
     20 import com.ibm.icu.impl.Utility;
     21 import com.ibm.icu.lang.UCharacter;
     22 import com.ibm.icu.text.UForwardCharacterIterator;
     23 import com.ibm.icu.text.UTF16;
     24 import com.ibm.icu.text.UnicodeSet;
     25 
     26 /**
     27  * Compares the contents of ICU data output while ignoring comments.
     28  *
     29  * @author markdavis, jchye
     30  *
     31  */
     32 public class CompareIcuOutput {
     33     private static final boolean DEBUG = false;
     34 
     35     private static final Options options = new Options(
     36         "Usage: RBChecker [OPTIONS] DIR1 DIR2 FILE_REGEX\n" +
     37             "This program is used to compare the RB text files in two different directories.\n" +
     38             "  Example: org.unicode.cldr.icu.RBChecker olddatadir newdatadir .*")
     39                 .add("sort", 's', null, null, "Sort values for comparison");
     40 
     41     private static final Comparator<String[]> comparator = new Comparator<String[]>() {
     42         @Override
     43         public int compare(String[] arg0, String[] arg1) {
     44             return arg0[0].compareTo(arg1[0]);
     45         }
     46     };
     47 
     48     private static boolean shouldSort = false;
     49 
     50     public static void main(String[] args) throws IOException {
     51         String dir1 = args[0];
     52         String dir2 = args[1];
     53         String regex = args[2];
     54         System.out.println("dir1 " + dir1);
     55         System.out.println("dir2 " + dir2);
     56         System.out.println("regex " + regex);
     57         shouldSort = options.get("sort").doesOccur();
     58         long totaltime = System.currentTimeMillis();
     59         System.out.println("Comparing the contents of text files...");
     60         compareTextFiles(dir1, dir2, regex);
     61         System.out.println("Total time taken: " + (System.currentTimeMillis() - totaltime));
     62     }
     63 
     64     /**
     65      * Parses and compares two ICU textfiles.
     66      *
     67      * @param dir1
     68      * @param dir2
     69      * @param regex
     70      * @throws IOException
     71      */
     72     private static void compareTextFiles(String dir1, String dir2, String regex) throws IOException {
     73         File localeDir = new File(dir1);
     74         if (!localeDir.exists()) localeDir = new File(dir1);
     75         String[] filenames = localeDir.list();
     76         int same = 0, different = 0;
     77         for (String filename : filenames) {
     78             if (!filename.matches(regex + "\\.txt")) continue;
     79             String locale = filename.substring(0, filename.length() - 4);
     80             try {
     81                 IcuData oldData = loadDataFromTextfiles(dir1, locale);
     82                 IcuData newData = loadDataFromTextfiles(dir2, locale);
     83                 StringBuffer messages = new StringBuffer();
     84                 if (analyseMatches(oldData, newData, messages)) {
     85                     System.out.println("=== Differences found for " + locale + " ===");
     86                     System.out.print(messages);
     87                     different++;
     88                 } else {
     89                     same++;
     90                 }
     91             } catch (FileNotFoundException e) {
     92                 System.err.println(locale + " file not found, skipping");
     93             }
     94         }
     95         System.out.println("Check finished with " + different + " different and " + same + " same locales.");
     96     }
     97 
     98     private static IcuData loadDataFromTextfiles(String icuPath, String locale) throws IOException {
     99         List<Row.R2<MyTokenizer.Type, String>> comments = new ArrayList<Row.R2<MyTokenizer.Type, String>>();
    100         IcuData icuData = new IcuData(locale + ".xml", locale, true);
    101         String filename = icuPath + '/' + locale + ".txt";
    102         if (new File(filename).exists()) {
    103             parseRB(filename, icuData, comments);
    104         } else {
    105             throw new FileNotFoundException(filename + " does not exist.");
    106         }
    107         return icuData;
    108     }
    109 
    110     /**
    111      * Computes lists of all differences between two sets of IcuData.
    112      *
    113      * @param oldData
    114      * @param newData
    115      */
    116     private static boolean analyseMatches(IcuData oldData, IcuData newData, StringBuffer buffer) {
    117         boolean hasDifferences = false;
    118         Set<String> missing = new TreeSet<String>(oldData.keySet());
    119         missing.removeAll(newData.keySet());
    120         if (missing.size() > 0) {
    121             buffer.append("Missing paths:\n");
    122             printAllInSet(oldData, missing, buffer);
    123             hasDifferences = true;
    124         }
    125         Set<String> extra = new TreeSet<String>(newData.keySet());
    126         extra.removeAll(oldData.keySet());
    127         if (extra.size() > 0) {
    128             buffer.append("Extra paths:\n");
    129             printAllInSet(newData, extra, buffer);
    130             hasDifferences = true;
    131         }
    132         Set<String> common = new TreeSet<String>(oldData.keySet());
    133         common.retainAll(newData.keySet());
    134         for (String rbPath : common) {
    135             if (rbPath.startsWith("/Version")) continue; // skip version
    136             List<String[]> oldValues = oldData.get(rbPath);
    137             List<String[]> newValues = newData.get(rbPath);
    138             if (shouldSort) {
    139                 Collections.sort(oldValues, comparator);
    140                 Collections.sort(newValues, comparator);
    141             }
    142             // Print out any value differences.
    143             if (valuesDiffer(oldValues, newValues)) {
    144                 buffer.append(rbPath + " contains differences:\n");
    145                 buffer.append("\tOld: ");
    146                 printValues(oldValues, buffer);
    147                 buffer.append("\tNew: ");
    148                 printValues(newValues, buffer);
    149                 hasDifferences = true;
    150             }
    151         }
    152         return hasDifferences;
    153     }
    154 
    155     private static void printAllInSet(IcuData icuData, Set<String> paths, StringBuffer buffer) {
    156         for (String path : paths) {
    157             buffer.append("\t" + path + " = ");
    158             printValues(icuData.get(path), buffer);
    159         }
    160     }
    161 
    162     private static void printValues(List<String[]> values, StringBuffer buffer) {
    163         // Enclose both numbers and strings in quotes for simplicity.
    164         for (String[] array : values) {
    165             if (array.length == 1) {
    166                 buffer.append('"' + array[0] + '"');
    167             } else {
    168                 buffer.append("[");
    169                 for (String value : array) {
    170                     buffer.append('"' + value + "\", ");
    171                 }
    172                 buffer.append("]");
    173             }
    174             buffer.append(", ");
    175         }
    176         buffer.append('\n');
    177     }
    178 
    179     /**
    180      * @param oldValues
    181      * @param newValues
    182      * @return true if the contents of the lists are identical
    183      */
    184     private static boolean valuesDiffer(List<String[]> oldValues, List<String[]> newValues) {
    185         if (oldValues.size() != newValues.size()) return true;
    186         boolean differ = false;
    187         for (int i = 0; i < oldValues.size(); i++) {
    188             String[] oldArray = oldValues.get(i);
    189             String[] newArray = newValues.get(i);
    190             if (oldArray.length != newArray.length) {
    191                 differ = true;
    192                 break;
    193             }
    194             for (int j = 0; j < oldArray.length; j++) {
    195                 // Ignore whitespace.
    196                 if (!oldArray[j].replace(" ", "").equals(newArray[j].replace(" ", ""))) {
    197                     differ = true;
    198                     break;
    199                 }
    200             }
    201         }
    202         return differ;
    203     }
    204 
    205     /**
    206      * Parse an ICU resource bundle into key,value items
    207      *
    208      * @param filename
    209      * @param output
    210      * @param comments
    211      */
    212     static void parseRB(String filename, IcuData icuData, List<R2<MyTokenizer.Type, String>> comments)
    213         throws IOException {
    214         BufferedReader in = null;
    215         File file = new File(filename);
    216         String coreFile = file.getName();
    217         if (!coreFile.endsWith(".txt")) {
    218             throw new IllegalArgumentException("missing .txt in: " + filename);
    219         }
    220         coreFile = coreFile.substring(0, coreFile.length() - 4);
    221         // redo this later on to use fixed PatternTokenizer
    222         in = FileUtilities.openUTF8Reader("", filename);
    223         MyTokenizer tokenIterator = new MyTokenizer(in);
    224         StringBuffer tokenText = new StringBuffer();
    225         List<String> oldPaths = new ArrayList<String>();
    226         List<Integer> indices = new ArrayList<Integer>();
    227         String lastLabel = null;
    228         String path = "";
    229         /*
    230          * AuxExemplarCharacters{
    231          * "[                                     "
    232          * "]" } ExemplarCharacters{
    233          * "[a b c d e f g h i j k l m n o p q r s t u v w x y z]"}
    234          * ExemplarCharactersCurrency
    235          * {"[a b c  d e f g h i j k l  m n o  p q r s t u v w x y z]"}
    236          * ExemplarCharactersIndex
    237          * {"[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]"}
    238          * ExemplarCharactersPunctuation{"[\-    , ; \: ! ? .  '   \"
    239          *   ( ) \[ \] @ * / \& #     ]"}
    240          */
    241         MyTokenizer.Type lastToken = null;
    242         List<String> arrayValues = null;
    243         while (true) {
    244             MyTokenizer.Type nextToken = tokenIterator.next(tokenText);
    245             if (DEBUG)
    246                 System.out.println(nextToken + "\t" + tokenText);
    247             switch (nextToken) {
    248             case BLOCK_COMMENT:
    249             case LINE_COMMENT:
    250                 if (comments != null) {
    251                     comments.add(Row.of(nextToken, tokenText.toString()));
    252                 }
    253                 continue;
    254             case DONE:
    255                 if (oldPaths.size() != 0) {
    256                     throw new IllegalArgumentException("missing }");
    257                 }
    258                 in.close();
    259                 return;
    260             case ID:
    261                 lastLabel = lastLabel == null ? tokenText.toString() : lastLabel + " " + tokenText;
    262                 break;
    263             case QUOTED:
    264                 if (lastLabel == null) {
    265                     lastLabel = tokenText.toString();
    266                 } else {
    267                     // Remove consecutive quotes.
    268                     lastLabel += tokenText;
    269                 }
    270                 break;
    271             case OPEN_BRACE:
    272                 // Check for array-type values.
    273                 if (lastToken == MyTokenizer.Type.COMMA) {
    274                     arrayValues = new ArrayList<String>();
    275                 } else {
    276                     oldPaths.add(path);
    277                     indices.add(0);
    278                     if (lastToken == MyTokenizer.Type.OPEN_BRACE || lastToken == MyTokenizer.Type.CLOSE_BRACE) {
    279                         int currentIndexPos = indices.size() - 2;
    280                         int currentIndex = indices.get(currentIndexPos);
    281                         lastLabel = "<" + currentIndex + ">";
    282                         indices.set(currentIndexPos, currentIndex + 1);
    283                     } else if (lastLabel.contains(":") && !lastLabel.contains(":int") && !lastLabel.contains(":alias")
    284                         || path.endsWith("/relative")) {
    285                         lastLabel = '"' + lastLabel + '"';
    286                     }
    287                     path += "/" + lastLabel;
    288                 }
    289                 lastLabel = null;
    290                 break;
    291             case CLOSE_BRACE:
    292                 if (lastLabel != null) {
    293                     addPath(path, lastLabel, icuData);
    294                     lastLabel = null;
    295                 }
    296 
    297                 if (arrayValues == null) {
    298                     path = oldPaths.remove(oldPaths.size() - 1);
    299                     indices.remove(indices.size() - 1);
    300                 } else {
    301                     // Value array closed, add it to the path.
    302                     String[] array = new String[0];
    303                     addPath(path, arrayValues.toArray(array), icuData);
    304                     arrayValues = null;
    305                 }
    306                 if (DEBUG)
    307                     System.out.println("POP:\t" + path);
    308                 break;
    309             case COMMA:
    310                 if (lastToken != MyTokenizer.Type.QUOTED && lastToken != MyTokenizer.Type.ID) {
    311                     throw new IllegalArgumentException(filename + ", " + path + ": Commas can only occur after values ");
    312                 } else if (lastLabel == null) {
    313                     throw new IllegalArgumentException(filename + ": Label missing!");
    314                 }
    315                 if (arrayValues != null) {
    316                     arrayValues.add(lastLabel);
    317                 } else {
    318                     addPath(path, lastLabel, icuData);
    319                 }
    320                 lastLabel = null;
    321                 break;
    322             default:
    323                 throw new IllegalArgumentException("Illegal type in " + filename + ": " + nextToken + "\t" + tokenText
    324                     + "\t" + Utility.hex(tokenText));
    325             }
    326             lastToken = nextToken;
    327         }
    328     }
    329 
    330     private static void addPath(String path, String value, IcuData icuData) {
    331         addPath(path, new String[] { value }, icuData);
    332     }
    333 
    334     private static void addPath(String path, String[] values, IcuData icuData) {
    335         path = path.substring(path.indexOf('/', 1));
    336         icuData.add(path, values);
    337     }
    338 
    339     /**
    340      * Reads in tokens from an ICU data file reader.
    341      * Replace by updated PatternTokenizer someday
    342      *
    343      * @author markdavis
    344      *
    345      */
    346     static class MyTokenizer {
    347         enum Type {
    348             DONE, ID, QUOTED, OPEN_BRACE, CLOSE_BRACE, COMMA, LINE_COMMENT, BLOCK_COMMENT, BROKEN_QUOTE, BROKEN_BLOCK_COMMENT, UNKNOWN
    349         }
    350 
    351         private final UForwardCharacterIterator source;
    352         private final UnicodeSet spaceCharacters = new UnicodeSet("[\\u0000\\uFEFF[:pattern_whitespace:]]");
    353         private final UnicodeSet idCharacters = new UnicodeSet("[-+.():%\"'[:xid_continue:]]");
    354         private final UnicodeSet quoteCharacters = new UnicodeSet("[\"']");
    355 
    356         private int bufferedChar;
    357 
    358         /**
    359          * @param reader
    360          */
    361         public MyTokenizer(Reader reader) {
    362             this.source = new UReaderForwardCharacterIterator(reader);
    363         }
    364 
    365         public Type next(StringBuffer tokenText) {
    366             int cp = getCodePoint();
    367             // Skip all spaces not in quotes.
    368             while (cp >= 0 && spaceCharacters.contains(cp)) {
    369                 cp = getCodePoint();
    370             }
    371 
    372             if (cp == -1) {
    373                 return Type.DONE;
    374             }
    375             tokenText.setLength(0);
    376             if (cp == '/') {
    377                 cp = getCodePoint();
    378                 if (cp == '/') { // line comment
    379                     while (true) {
    380                         cp = getCodePoint();
    381                         if (cp == '\n' || cp < 0) {
    382                             return Type.LINE_COMMENT;
    383                         }
    384                         tokenText.appendCodePoint(cp);
    385                     }
    386                 } else if (cp == '*') { // block comment
    387                     while (true) {
    388                         cp = getCodePoint();
    389                         if (cp < 0) {
    390                             return Type.BROKEN_BLOCK_COMMENT;
    391                         }
    392                         while (cp == '*') {
    393                             int cp2 = getCodePoint();
    394                             if (cp2 < 0) {
    395                                 return Type.BROKEN_BLOCK_COMMENT;
    396                             } else if (cp2 == '/') {
    397                                 return Type.BLOCK_COMMENT;
    398                             }
    399                             tokenText.appendCodePoint(cp);
    400                             cp = cp2;
    401                         }
    402                         tokenText.appendCodePoint(cp);
    403                     }
    404                 } else {
    405                     throw new IllegalArgumentException("/ can only be in quotes or comments");
    406                 }
    407             }
    408             if (quoteCharacters.contains(cp)) {
    409                 // Return the text inside and *excluding* the quotes.
    410                 int oldQuote = cp;
    411                 cp = getCodePoint();
    412                 while (cp != oldQuote) {
    413                     if (cp < 0) {
    414                         return Type.BROKEN_QUOTE;
    415                     } else if (cp == '\\') {
    416                         tokenText.appendCodePoint(cp);
    417                         cp = getCodePoint();
    418                         if (cp < 0) {
    419                             return Type.BROKEN_QUOTE;
    420                         }
    421                     }
    422                     tokenText.appendCodePoint(cp);
    423                     cp = getCodePoint();
    424                 }
    425                 ;
    426                 return Type.QUOTED;
    427             }
    428             if (cp == '{') {
    429                 return Type.OPEN_BRACE;
    430             }
    431             if (cp == '}') {
    432                 return Type.CLOSE_BRACE;
    433             }
    434             if (cp == ',') {
    435                 return Type.COMMA;
    436             }
    437             if (idCharacters.contains(cp)) {
    438                 while (true) {
    439                     tokenText.appendCodePoint(cp);
    440                     cp = getCodePoint();
    441                     if (cp < 0 || !idCharacters.contains(cp)) {
    442                         pushCodePoint(cp);
    443                         return Type.ID;
    444                     }
    445                 }
    446             }
    447             tokenText.appendCodePoint(cp);
    448             return Type.UNKNOWN;
    449         }
    450 
    451         int getCodePoint() {
    452             if (bufferedChar >= 0) {
    453                 int result = bufferedChar;
    454                 bufferedChar = -1;
    455                 return result;
    456             }
    457             return source.nextCodePoint();
    458         }
    459 
    460         void pushCodePoint(int codepoint) {
    461             if (bufferedChar >= 0) {
    462                 throw new IllegalArgumentException("Cannot push twice");
    463             }
    464             bufferedChar = codepoint;
    465         }
    466     }
    467 
    468     public static class UReaderForwardCharacterIterator implements UForwardCharacterIterator {
    469         private Reader reader;
    470         private int bufferedChar = -1;
    471 
    472         /**
    473          * @param reader
    474          */
    475         public UReaderForwardCharacterIterator(Reader reader) {
    476             this.reader = reader;
    477         }
    478 
    479         /*
    480          * (non-Javadoc)
    481          *
    482          * @see com.ibm.icu.text.UForwardCharacterIterator#next()
    483          */
    484         public int next() {
    485             if (bufferedChar >= 0) {
    486                 int temp = bufferedChar;
    487                 bufferedChar = -1;
    488                 return temp;
    489             }
    490             try {
    491                 return reader.read();
    492             } catch (IOException e) {
    493                 throw new IllegalArgumentException(e);
    494             }
    495         }
    496 
    497         /*
    498          * (non-Javadoc)
    499          *
    500          * @see com.ibm.icu.text.UForwardCharacterIterator#nextCodePoint()
    501          */
    502         public int nextCodePoint() {
    503             int ch1 = next();
    504             if (UTF16.isLeadSurrogate((char) ch1)) {
    505                 int bufferedChar = next();
    506                 if (UTF16.isTrailSurrogate((char) bufferedChar)) {
    507                     return UCharacter.getCodePoint((char) ch1,
    508                         (char) bufferedChar);
    509                 }
    510             }
    511             return ch1;
    512         }
    513     }
    514 }
    515