Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 //
      4 //  file:  rbbirb.cpp
      5 //
      6 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
      7 //  All Rights Reserved.
      8 //
      9 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
     10 //    building (compiling) break rules into the tables required by the runtime
     11 //    RBBI engine.
     12 //
     13 
     14 #include "unicode/utypes.h"
     15 
     16 #if !UCONFIG_NO_BREAK_ITERATION
     17 
     18 #include "unicode/brkiter.h"
     19 #include "unicode/rbbi.h"
     20 #include "unicode/ubrk.h"
     21 #include "unicode/unistr.h"
     22 #include "unicode/uniset.h"
     23 #include "unicode/uchar.h"
     24 #include "unicode/uchriter.h"
     25 #include "unicode/parsepos.h"
     26 #include "unicode/parseerr.h"
     27 
     28 #include "cmemory.h"
     29 #include "cstring.h"
     30 #include "rbbirb.h"
     31 #include "rbbinode.h"
     32 #include "rbbiscan.h"
     33 #include "rbbisetb.h"
     34 #include "rbbitblb.h"
     35 #include "rbbidata.h"
     36 #include "uassert.h"
     37 
     38 
     39 U_NAMESPACE_BEGIN
     40 
     41 
     42 //----------------------------------------------------------------------------------------
     43 //
     44 //  Constructor.
     45 //
     46 //----------------------------------------------------------------------------------------
     47 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
     48                                        UParseError     *parseErr,
     49                                        UErrorCode      &status)
     50  : fRules(rules)
     51 {
     52     fStatus = &status; // status is checked below
     53     fParseError = parseErr;
     54     fDebugEnv   = NULL;
     55 #ifdef RBBI_DEBUG
     56     fDebugEnv   = getenv("U_RBBIDEBUG");
     57 #endif
     58 
     59 
     60     fForwardTree        = NULL;
     61     fReverseTree        = NULL;
     62     fSafeFwdTree        = NULL;
     63     fSafeRevTree        = NULL;
     64     fDefaultTree        = &fForwardTree;
     65     fForwardTables      = NULL;
     66     fReverseTables      = NULL;
     67     fSafeFwdTables      = NULL;
     68     fSafeRevTables      = NULL;
     69     fRuleStatusVals     = NULL;
     70     fChainRules         = FALSE;
     71     fLBCMNoChain        = FALSE;
     72     fLookAheadHardBreak = FALSE;
     73     fUSetNodes          = NULL;
     74     fRuleStatusVals     = NULL;
     75     fScanner            = NULL;
     76     fSetBuilder         = NULL;
     77     if (parseErr) {
     78         uprv_memset(parseErr, 0, sizeof(UParseError));
     79     }
     80 
     81     if (U_FAILURE(status)) {
     82         return;
     83     }
     84 
     85     fUSetNodes          = new UVector(status); // bcos status gets overwritten here
     86     fRuleStatusVals     = new UVector(status);
     87     fScanner            = new RBBIRuleScanner(this);
     88     fSetBuilder         = new RBBISetBuilder(this);
     89     if (U_FAILURE(status)) {
     90         return;
     91     }
     92     if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
     93         status = U_MEMORY_ALLOCATION_ERROR;
     94     }
     95 }
     96 
     97 
     98 
     99 //----------------------------------------------------------------------------------------
    100 //
    101 //  Destructor
    102 //
    103 //----------------------------------------------------------------------------------------
    104 RBBIRuleBuilder::~RBBIRuleBuilder() {
    105 
    106     int        i;
    107     for (i=0; ; i++) {
    108         RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
    109         if (n==NULL) {
    110             break;
    111         }
    112         delete n;
    113     }
    114 
    115     delete fUSetNodes;
    116     delete fSetBuilder;
    117     delete fForwardTables;
    118     delete fReverseTables;
    119     delete fSafeFwdTables;
    120     delete fSafeRevTables;
    121 
    122     delete fForwardTree;
    123     delete fReverseTree;
    124     delete fSafeFwdTree;
    125     delete fSafeRevTree;
    126     delete fScanner;
    127     delete fRuleStatusVals;
    128 }
    129 
    130 
    131 
    132 
    133 
    134 //----------------------------------------------------------------------------------------
    135 //
    136 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
    137 //                    the format for saving in ICU data files,
    138 //                    which is also the format needed by the RBBI runtime engine.
    139 //
    140 //----------------------------------------------------------------------------------------
    141 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
    142 
    143 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
    144     int32_t    i;
    145 
    146     if (U_FAILURE(*fStatus)) {
    147         return NULL;
    148     }
    149 
    150     // Remove comments and whitespace from the rules to make it smaller.
    151     UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
    152 
    153     // Calculate the size of each section in the data.
    154     //   Sizes here are padded up to a multiple of 8 for better memory alignment.
    155     //   Sections sizes actually stored in the header are for the actual data
    156     //     without the padding.
    157     //
    158     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
    159     int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
    160     int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
    161     int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
    162     int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
    163     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
    164     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
    165     int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
    166 
    167     (void)safeFwdTableSize;
    168 
    169     int32_t         totalSize = headerSize
    170                                 + forwardTableSize
    171                                 + /* reverseTableSize */ 0
    172                                 + /* safeFwdTableSize */ 0
    173                                 + (safeRevTableSize ? safeRevTableSize : reverseTableSize)
    174                                 + statusTableSize + trieSize + rulesSize;
    175 
    176     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
    177     if (data == NULL) {
    178         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    179         return NULL;
    180     }
    181     uprv_memset(data, 0, totalSize);
    182 
    183 
    184     data->fMagic            = 0xb1a0;
    185     data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
    186     data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
    187     data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
    188     data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
    189     data->fLength           = totalSize;
    190     data->fCatCount         = fSetBuilder->getNumCharCategories();
    191 
    192     // Only save the forward table and the safe reverse table,
    193     // because these are the only ones used at run-time.
    194     //
    195     // For the moment, we still build the other tables if they are present in the rule source files,
    196     // for backwards compatibility. Old rule files need to work, and this is the simplest approach.
    197     //
    198     // Additional backwards compatibility consideration: if no safe rules are provided, consider the
    199     // reverse rules to actually be the safe reverse rules.
    200 
    201     data->fFTable        = headerSize;
    202     data->fFTableLen     = forwardTableSize;
    203 
    204     // Do not save Reverse Table.
    205     data->fRTable        = data->fFTable  + forwardTableSize;
    206     data->fRTableLen     = 0;
    207 
    208     // Do not save the Safe Forward table.
    209     data->fSFTable       = data->fRTable + 0;
    210     data->fSFTableLen    = 0;
    211 
    212     data->fSRTable       = data->fSFTable + 0;
    213     if (safeRevTableSize > 0) {
    214         data->fSRTableLen    = safeRevTableSize;
    215     } else if (reverseTableSize > 0) {
    216         data->fSRTableLen    = reverseTableSize;
    217     } else {
    218         U_ASSERT(FALSE);    // Rule build should have failed for lack of a reverse table
    219                             // before reaching this point.
    220     }
    221 
    222 
    223     data->fTrie          = data->fSRTable + data->fSRTableLen;
    224     data->fTrieLen       = fSetBuilder->getTrieSize();
    225     data->fStatusTable   = data->fTrie    + trieSize;
    226     data->fStatusTableLen= statusTableSize;
    227     data->fRuleSource    = data->fStatusTable + statusTableSize;
    228     data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
    229 
    230     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
    231 
    232     fForwardTables->exportTable((uint8_t *)data + data->fFTable);
    233     // fReverseTables->exportTable((uint8_t *)data + data->fRTable);
    234     // fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
    235     if (safeRevTableSize > 0) {
    236         fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
    237     } else {
    238         fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
    239     }
    240 
    241     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
    242 
    243     int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
    244     for (i=0; i<fRuleStatusVals->size(); i++) {
    245         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
    246     }
    247 
    248     strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
    249 
    250     return data;
    251 }
    252 
    253 
    254 
    255 
    256 
    257 
    258 //----------------------------------------------------------------------------------------
    259 //
    260 //  createRuleBasedBreakIterator    construct from source rules that are passed in
    261 //                                  in a UnicodeString
    262 //
    263 //----------------------------------------------------------------------------------------
    264 BreakIterator *
    265 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
    266                                     UParseError      *parseError,
    267                                     UErrorCode       &status)
    268 {
    269     // status checked below
    270 
    271     //
    272     // Read the input rules, generate a parse tree, symbol table,
    273     // and list of all Unicode Sets referenced by the rules.
    274     //
    275     RBBIRuleBuilder  builder(rules, parseError, status);
    276     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
    277         return NULL;
    278     }
    279     builder.fScanner->parse();
    280 
    281     //
    282     // UnicodeSet processing.
    283     //    Munge the Unicode Sets to create a set of character categories.
    284     //    Generate the mapping tables (TRIE) from input 32-bit characters to
    285     //    the character categories.
    286     //
    287     builder.fSetBuilder->build();
    288 
    289 
    290     //
    291     //   Generate the DFA state transition table.
    292     //
    293     builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
    294     builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
    295     builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
    296     builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
    297     if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
    298         builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
    299     {
    300         status = U_MEMORY_ALLOCATION_ERROR;
    301         delete builder.fForwardTables; builder.fForwardTables = NULL;
    302         delete builder.fReverseTables; builder.fReverseTables = NULL;
    303         delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
    304         delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
    305         return NULL;
    306     }
    307 
    308     builder.fForwardTables->build();
    309     builder.fReverseTables->build();
    310     builder.fSafeFwdTables->build();
    311     builder.fSafeRevTables->build();
    312 
    313 #ifdef RBBI_DEBUG
    314     if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
    315         builder.fForwardTables->printRuleStatusTable();
    316     }
    317 #endif
    318 
    319     //
    320     //   Package up the compiled data into a memory image
    321     //      in the run-time format.
    322     //
    323     RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
    324     if (U_FAILURE(*builder.fStatus)) {
    325         return NULL;
    326     }
    327 
    328 
    329     //
    330     //  Clean up the compiler related stuff
    331     //
    332 
    333 
    334     //
    335     //  Create a break iterator from the compiled rules.
    336     //     (Identical to creation from stored pre-compiled rules)
    337     //
    338     // status is checked after init in construction.
    339     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
    340     if (U_FAILURE(status)) {
    341         delete This;
    342         This = NULL;
    343     }
    344     else if(This == NULL) { // test for NULL
    345         status = U_MEMORY_ALLOCATION_ERROR;
    346     }
    347     return This;
    348 }
    349 
    350 U_NAMESPACE_END
    351 
    352 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    353