Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 //
      4 //    Copyright (C) 2002-2014, International Business Machines Corporation and others.
      5 //    All Rights Reserved.
      6 //
      7 //
      8 
      9 package com.ibm.icu.text;
     10 
     11 import java.io.DataOutputStream;
     12 import java.io.IOException;
     13 import java.io.OutputStream;
     14 import java.util.ArrayList;
     15 import java.util.HashMap;
     16 import java.util.List;
     17 import java.util.Map;
     18 import java.util.Set;
     19 
     20 import com.ibm.icu.impl.Assert;
     21 import com.ibm.icu.impl.ICUBinary;
     22 import com.ibm.icu.impl.ICUDebug;
     23 
     24 class RBBIRuleBuilder {
     25     //   This is the main class for building (compiling) break rules into the tables
     26     //    required by the runtime RBBI engine.
     27     //
     28 
     29     String fDebugEnv;              // controls debug trace output
     30     String fRules;                 // The rule string that we are compiling
     31     RBBIRuleScanner fScanner;      // The scanner.
     32 
     33 
     34     //
     35     //  There are four separate parse trees generated, one for each of the
     36     //    forward rules, reverse rules, safe forward rules and safe reverse rules.
     37     //  This array references the root of each of the trees.
     38     //
     39     RBBINode[]         fTreeRoots = new RBBINode[4];
     40     static final int   fForwardTree = 0;  // Indexes into the above fTreeRoots array
     41     static final int   fReverseTree = 1;  //   for each of the trees.
     42     static final int   fSafeFwdTree = 2;  //   (in C, these are pointer variables and
     43     static final int   fSafeRevTree = 3;  //    there is no array.)
     44     int fDefaultTree = fForwardTree;      // For rules not qualified with a !
     45                                           //   the tree to which they belong to.
     46 
     47     boolean fChainRules;                  // True for chained Unicode TR style rules.
     48                                           // False for traditional regexp rules.
     49 
     50     boolean fLBCMNoChain;                 // True:  suppress chaining of rules on
     51                                           //   chars with LineBreak property == CM.
     52 
     53     boolean fLookAheadHardBreak;          // True:  Look ahead matches cause an
     54                                           // immediate break, no continuing for the
     55                                           // longest match.
     56 
     57     RBBISetBuilder fSetBuilder;           // Set and Character Category builder.
     58     List<RBBINode> fUSetNodes;            // Vector of all uset nodes.
     59     RBBITableBuilder fForwardTables;      // State transition tables
     60     RBBITableBuilder fReverseTables;
     61     RBBITableBuilder fSafeFwdTables;
     62     RBBITableBuilder fSafeRevTables;
     63 
     64     //
     65     // Status {tag} values.   These structures are common to all of the rule sets (Forward, Reverse, etc.).
     66     //
     67     Map<Set<Integer>, Integer> fStatusSets = new HashMap<Set<Integer>, Integer>(); // Status value sets encountered so far.
     68                                                                                    //  Map Key is the set of values.
     69                                                                                    //  Map Value is the runtime array index.
     70 
     71     List<Integer> fRuleStatusVals;        // List of Integer objects.  Has same layout as the
     72                                           //   runtime array of status (tag) values -
     73                                           //     number of values in group 1
     74                                           //        first status value in group 1
     75                                           //        2nd status value in group 1
     76                                           //        ...
     77                                           //     number of values in group 2
     78                                           //        first status value in group 2
     79                                           //        etc.
     80                                           //
     81     // Error codes from ICU4C.
     82     //    using these simplified the porting, and consolidated the
     83     //    creation of Java exceptions
     84     //
     85     static final int U_BRK_ERROR_START = 0x10200;
     86     /**< Start of codes indicating Break Iterator failures */
     87 
     88     static final int U_BRK_INTERNAL_ERROR = 0x10201;
     89     /**< An internal error (bug) was detected.             */
     90 
     91     static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202;
     92     /**< Hex digits expected as part of a escaped char in a rule. */
     93 
     94     static final int U_BRK_SEMICOLON_EXPECTED = 0x10203;
     95     /**< Missing ';' at the end of a RBBI rule.            */
     96 
     97     static final int U_BRK_RULE_SYNTAX = 0x10204;
     98     /**< Syntax error in RBBI rule.                        */
     99 
    100     static final int U_BRK_UNCLOSED_SET = 0x10205;
    101     /**< UnicodeSet witing an RBBI rule missing a closing ']'.  */
    102 
    103     static final int U_BRK_ASSIGN_ERROR = 0x10206;
    104     /**< Syntax error in RBBI rule assignment statement.   */
    105 
    106     static final int U_BRK_VARIABLE_REDFINITION = 0x10207;
    107     /**< RBBI rule $Variable redefined.                    */
    108 
    109     static final int U_BRK_MISMATCHED_PAREN = 0x10208;
    110     /**< Mis-matched parentheses in an RBBI rule.          */
    111 
    112     static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209;
    113     /**< Missing closing quote in an RBBI rule.            */
    114 
    115     static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a;
    116     /**< Use of an undefined $Variable in an RBBI rule.    */
    117 
    118     static final int U_BRK_INIT_ERROR = 0x1020b;
    119     /**< Initialization failure.  Probable missing ICU Data. */
    120 
    121     static final int U_BRK_RULE_EMPTY_SET = 0x1020c;
    122     /**< Rule contains an empty Unicode Set.               */
    123 
    124     static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d;
    125     /**< !!option in RBBI rules not recognized.            */
    126 
    127     static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e;
    128     /**< The {nnn} tag on a rule is mal formed             */
    129     static final int U_BRK_MALFORMED_SET = 0x1020f;
    130 
    131     static final int U_BRK_ERROR_LIMIT = 0x10210;
    132     /**< This must always be the last value to indicate the limit for Break Iterator failures */
    133 
    134 
    135     //----------------------------------------------------------------------------------------
    136     //
    137     //  Constructor.
    138     //
    139     //----------------------------------------------------------------------------------------
    140     RBBIRuleBuilder(String rules)
    141     {
    142         fDebugEnv       = ICUDebug.enabled("rbbi") ?
    143                             ICUDebug.value("rbbi") : null;
    144         fRules          = rules;
    145         fUSetNodes      = new ArrayList<RBBINode>();
    146         fRuleStatusVals = new ArrayList<Integer>();
    147         fScanner        = new RBBIRuleScanner(this);
    148         fSetBuilder     = new RBBISetBuilder(this);
    149     }
    150 
    151     //----------------------------------------------------------------------------------------
    152     //
    153     //   flattenData() -  Collect up the compiled RBBI rule data and put it into
    154     //                    the format for saving in ICU data files,
    155     //
    156     //                    See the ICU4C file common/rbidata.h for a detailed description.
    157     //
    158     //----------------------------------------------------------------------------------------
    159     static final int align8(int i)
    160     {
    161         return (i + 7) & 0xfffffff8;
    162     }
    163 
    164     void flattenData(OutputStream os) throws IOException {
    165         DataOutputStream dos = new DataOutputStream(os);
    166         int i;
    167 
    168         //  Remove comments and whitespace from the rules to make it smaller.
    169         String strippedRules = RBBIRuleScanner.stripRules(fRules);
    170 
    171         // Calculate the size of each section in the data in bytes.
    172         //   Sizes here are padded up to a multiple of 8 for better memory alignment.
    173         //   Sections sizes actually stored in the header are for the actual data
    174         //     without the padding.
    175         //
    176         int headerSize       = 24 * 4;     // align8(sizeof(RBBIDataHeader));
    177         int forwardTableSize = align8(fForwardTables.getTableSize());
    178         int reverseTableSize = align8(fReverseTables.getTableSize());
    179         // int safeFwdTableSize = align8(fSafeFwdTables.getTableSize());
    180         int safeRevTableSize = align8(fSafeRevTables.getTableSize());
    181         int trieSize         = align8(fSetBuilder.getTrieSize());
    182         int statusTableSize  = align8(fRuleStatusVals.size() * 4);
    183         int rulesSize        = align8((strippedRules.length()) * 2);
    184 
    185         int totalSize = headerSize
    186                 + forwardTableSize
    187                 + /* reverseTableSize */ 0
    188                 + /* safeFwdTableSize */ 0
    189                 + (safeRevTableSize > 0 ? safeRevTableSize : reverseTableSize)
    190                 + statusTableSize + trieSize + rulesSize;
    191         int outputPos = 0;               // Track stream position, starting from RBBIDataHeader.
    192 
    193         //
    194         // Write out an ICU Data Header
    195         //
    196         ICUBinary.writeHeader(RBBIDataWrapper.DATA_FORMAT, RBBIDataWrapper.FORMAT_VERSION, 0, dos);
    197 
    198         //
    199         // Write out the RBBIDataHeader
    200         //
    201         int[] header = new int[RBBIDataWrapper.DH_SIZE];                 // sizeof struct RBBIDataHeader
    202         header[RBBIDataWrapper.DH_MAGIC]         = 0xb1a0;
    203         header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION;
    204         header[RBBIDataWrapper.DH_LENGTH]        = totalSize;            // fLength, the total size of all rule sections.
    205         header[RBBIDataWrapper.DH_CATCOUNT]      = fSetBuilder.getNumCharCategories(); // fCatCount.
    206 
    207         // Only save the forward table and the safe reverse table,
    208         // because these are the only ones used at run-time.
    209         //
    210         // For the moment, we still build the other tables if they are present in the rule source files,
    211         // for backwards compatibility. Old rule files need to work, and this is the simplest approach.
    212         //
    213         // Additional backwards compatibility consideration: if no safe rules are provided, consider the
    214         // reverse rules to actually be the safe reverse rules.
    215 
    216         header[RBBIDataWrapper.DH_FTABLE]        = headerSize;           // fFTable
    217         header[RBBIDataWrapper.DH_FTABLELEN]     = forwardTableSize;     // fTableLen
    218 
    219         // Do not save Reverse Table.
    220         header[RBBIDataWrapper.DH_RTABLE]        = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
    221         header[RBBIDataWrapper.DH_RTABLELEN]     = 0;                    // fRTableLen
    222 
    223         // Do not save the Safe Forward table.
    224         header[RBBIDataWrapper.DH_SFTABLE]       = header[RBBIDataWrapper.DH_RTABLE]
    225                                                      + 0;                // fSTable
    226         header[RBBIDataWrapper.DH_SFTABLELEN]    = 0;                    // fSTableLen
    227 
    228         // Safe reverse table. Use if present, otherwise save regular reverse table as the safe reverse.
    229         header[RBBIDataWrapper.DH_SRTABLE]       = header[RBBIDataWrapper.DH_SFTABLE]
    230                                                      + 0;                // fSRTable
    231         if (safeRevTableSize > 0) {
    232             header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize;
    233         } else {
    234             assert reverseTableSize > 0;
    235             header[RBBIDataWrapper.DH_SRTABLELEN] = reverseTableSize;
    236         }
    237 
    238         header[RBBIDataWrapper.DH_TRIE]          = header[RBBIDataWrapper.DH_SRTABLE]
    239                                                      + header[RBBIDataWrapper.DH_SRTABLELEN]; // fTrie
    240         header[RBBIDataWrapper.DH_TRIELEN]       = fSetBuilder.getTrieSize(); // fTrieLen
    241         header[RBBIDataWrapper.DH_STATUSTABLE]   = header[RBBIDataWrapper.DH_TRIE]
    242                                                      + header[RBBIDataWrapper.DH_TRIELEN];
    243         header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen
    244         header[RBBIDataWrapper.DH_RULESOURCE]    = header[RBBIDataWrapper.DH_STATUSTABLE]
    245                                                      + statusTableSize;
    246         header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules.length() * 2;
    247         for (i = 0; i < header.length; i++) {
    248             dos.writeInt(header[i]);
    249             outputPos += 4;
    250         }
    251 
    252         // Write out the actual state tables.
    253         short[] tableData;
    254         tableData = fForwardTables.exportTable();
    255         Assert.assrt(outputPos == header[4]);
    256         for (i = 0; i < tableData.length; i++) {
    257             dos.writeShort(tableData[i]);
    258             outputPos += 2;
    259         }
    260 
    261         /* do not write the reverse table
    262         tableData = fReverseTables.exportTable();
    263         Assert.assrt(outputPos == header[6]);
    264         for (i = 0; i < tableData.length; i++) {
    265             dos.writeShort(tableData[i]);
    266             outputPos += 2;
    267         }
    268         */
    269 
    270         /* do not write safe forwards table
    271         Assert.assrt(outputPos == header[8]);
    272         tableData = fSafeFwdTables.exportTable();
    273         for (i = 0; i < tableData.length; i++) {
    274             dos.writeShort(tableData[i]);
    275             outputPos += 2;
    276         }
    277         */
    278 
    279         // Write the safe reverse table.
    280         // If not present, write the plain reverse table (old style rule compatibility)
    281         Assert.assrt(outputPos == header[10]);
    282         if (safeRevTableSize > 0) {
    283             tableData = fSafeRevTables.exportTable();
    284         } else {
    285             tableData = fReverseTables.exportTable();
    286         }
    287         for (i = 0; i < tableData.length; i++) {
    288             dos.writeShort(tableData[i]);
    289             outputPos += 2;
    290         }
    291 
    292         // write out the Trie table
    293         Assert.assrt(outputPos == header[12]);
    294         fSetBuilder.serializeTrie(os);
    295         outputPos += header[13];
    296         while (outputPos % 8 != 0) { // pad to an 8 byte boundary
    297             dos.write(0);
    298             outputPos += 1;
    299         }
    300 
    301         // Write out the status {tag} table.
    302         Assert.assrt(outputPos == header[16]);
    303         for (Integer val : fRuleStatusVals) {
    304             dos.writeInt(val.intValue());
    305             outputPos += 4;
    306         }
    307 
    308         while (outputPos % 8 != 0) { // pad to an 8 byte boundary
    309             dos.write(0);
    310             outputPos += 1;
    311         }
    312 
    313         // Write out the stripped rules (rules with extra spaces removed
    314         //   These go last in the data area, even though they are not last in the header.
    315         Assert.assrt(outputPos == header[14]);
    316         dos.writeChars(strippedRules);
    317         outputPos += strippedRules.length() * 2;
    318         while (outputPos % 8 != 0) { // pad to an 8 byte boundary
    319             dos.write(0);
    320             outputPos += 1;
    321         }
    322     }
    323 
    324     //----------------------------------------------------------------------------------------
    325     //
    326     //  compileRules          compile source rules, placing the compiled form into a output stream
    327     //                        The compiled form is identical to that from ICU4C (Big Endian).
    328     //
    329     //----------------------------------------------------------------------------------------
    330     static void compileRules(String rules, OutputStream os) throws IOException
    331     {
    332         //
    333         // Read the input rules, generate a parse tree, symbol table,
    334         // and list of all Unicode Sets referenced by the rules.
    335         //
    336         RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
    337         builder.fScanner.parse();
    338 
    339         //
    340         // UnicodeSet processing.
    341         //    Munge the Unicode Sets to create a set of character categories.
    342         //    Generate the mapping tables (TRIE) from input 32-bit characters to
    343         //    the character categories.
    344         //
    345         builder.fSetBuilder.build();
    346 
    347         //
    348         //   Generate the DFA state transition table.
    349         //
    350         builder.fForwardTables = new RBBITableBuilder(builder, fForwardTree);
    351         builder.fReverseTables = new RBBITableBuilder(builder, fReverseTree);
    352         builder.fSafeFwdTables = new RBBITableBuilder(builder, fSafeFwdTree);
    353         builder.fSafeRevTables = new RBBITableBuilder(builder, fSafeRevTree);
    354         builder.fForwardTables.build();
    355         builder.fReverseTables.build();
    356         builder.fSafeFwdTables.build();
    357         builder.fSafeRevTables.build();
    358         if (builder.fDebugEnv != null
    359                 && builder.fDebugEnv.indexOf("states") >= 0) {
    360             builder.fForwardTables.printRuleStatusTable();
    361         }
    362 
    363         //
    364         //   Package up the compiled data, writing it to an output stream
    365         //      in the serialization format.  This is the same as the ICU4C runtime format.
    366         //
    367         builder.flattenData(os);
    368     }
    369 }
    370