Home | History | Annotate | Download | only in common
      1 //
      2 //  file:  rbbirb.cpp
      3 //
      4 //  Copyright (C) 2002-2011, International Business Machines Corporation and others.
      5 //  All Rights Reserved.
      6 //
      7 //  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
      8 //    building (compiling) break rules into the tables required by the runtime
      9 //    RBBI engine.
     10 //
     11 
     12 #include "unicode/utypes.h"
     13 
     14 #if !UCONFIG_NO_BREAK_ITERATION
     15 
     16 #include "unicode/brkiter.h"
     17 #include "unicode/rbbi.h"
     18 #include "unicode/ubrk.h"
     19 #include "unicode/unistr.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/uchriter.h"
     23 #include "unicode/parsepos.h"
     24 #include "unicode/parseerr.h"
     25 #include "cmemory.h"
     26 #include "cstring.h"
     27 
     28 #include "rbbirb.h"
     29 #include "rbbinode.h"
     30 
     31 #include "rbbiscan.h"
     32 #include "rbbisetb.h"
     33 #include "rbbitblb.h"
     34 #include "rbbidata.h"
     35 
     36 
     37 U_NAMESPACE_BEGIN
     38 
     39 
     40 //----------------------------------------------------------------------------------------
     41 //
     42 //  Constructor.
     43 //
     44 //----------------------------------------------------------------------------------------
     45 RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
     46                                        UParseError     *parseErr,
     47                                        UErrorCode      &status)
     48  : fRules(rules)
     49 {
     50     fStatus = &status; // status is checked below
     51     fParseError = parseErr;
     52     fDebugEnv   = NULL;
     53 #ifdef RBBI_DEBUG
     54     fDebugEnv   = getenv("U_RBBIDEBUG");
     55 #endif
     56 
     57 
     58     fForwardTree        = NULL;
     59     fReverseTree        = NULL;
     60     fSafeFwdTree        = NULL;
     61     fSafeRevTree        = NULL;
     62     fDefaultTree        = &fForwardTree;
     63     fForwardTables      = NULL;
     64     fReverseTables      = NULL;
     65     fSafeFwdTables      = NULL;
     66     fSafeRevTables      = NULL;
     67     fRuleStatusVals     = NULL;
     68     fChainRules         = FALSE;
     69     fLBCMNoChain        = FALSE;
     70     fLookAheadHardBreak = FALSE;
     71     fUSetNodes          = NULL;
     72     fRuleStatusVals     = NULL;
     73     fScanner            = NULL;
     74     fSetBuilder         = NULL;
     75     if (parseErr) {
     76         uprv_memset(parseErr, 0, sizeof(UParseError));
     77     }
     78 
     79     if (U_FAILURE(status)) {
     80         return;
     81     }
     82 
     83     fUSetNodes          = new UVector(status); // bcos status gets overwritten here
     84     fRuleStatusVals     = new UVector(status);
     85     fScanner            = new RBBIRuleScanner(this);
     86     fSetBuilder         = new RBBISetBuilder(this);
     87     if (U_FAILURE(status)) {
     88         return;
     89     }
     90     if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
     91         status = U_MEMORY_ALLOCATION_ERROR;
     92     }
     93 }
     94 
     95 
     96 
     97 //----------------------------------------------------------------------------------------
     98 //
     99 //  Destructor
    100 //
    101 //----------------------------------------------------------------------------------------
    102 RBBIRuleBuilder::~RBBIRuleBuilder() {
    103 
    104     int        i;
    105     for (i=0; ; i++) {
    106         RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
    107         if (n==NULL) {
    108             break;
    109         }
    110         delete n;
    111     }
    112 
    113     delete fUSetNodes;
    114     delete fSetBuilder;
    115     delete fForwardTables;
    116     delete fReverseTables;
    117     delete fSafeFwdTables;
    118     delete fSafeRevTables;
    119 
    120     delete fForwardTree;
    121     delete fReverseTree;
    122     delete fSafeFwdTree;
    123     delete fSafeRevTree;
    124     delete fScanner;
    125     delete fRuleStatusVals;
    126 }
    127 
    128 
    129 
    130 
    131 
    132 //----------------------------------------------------------------------------------------
    133 //
    134 //   flattenData() -  Collect up the compiled RBBI rule data and put it into
    135 //                    the format for saving in ICU data files,
    136 //                    which is also the format needed by the RBBI runtime engine.
    137 //
    138 //----------------------------------------------------------------------------------------
    139 static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
    140 
    141 RBBIDataHeader *RBBIRuleBuilder::flattenData() {
    142     int32_t    i;
    143 
    144     if (U_FAILURE(*fStatus)) {
    145         return NULL;
    146     }
    147 
    148     // Remove comments and whitespace from the rules to make it smaller.
    149     UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
    150 
    151     // Calculate the size of each section in the data.
    152     //   Sizes here are padded up to a multiple of 8 for better memory alignment.
    153     //   Sections sizes actually stored in the header are for the actual data
    154     //     without the padding.
    155     //
    156     int32_t headerSize        = align8(sizeof(RBBIDataHeader));
    157     int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
    158     int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
    159     int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
    160     int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
    161     int32_t trieSize          = align8(fSetBuilder->getTrieSize());
    162     int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
    163     int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
    164 
    165     int32_t         totalSize = headerSize + forwardTableSize + reverseTableSize
    166                                 + safeFwdTableSize + safeRevTableSize
    167                                 + statusTableSize + trieSize + rulesSize;
    168 
    169     RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
    170     if (data == NULL) {
    171         *fStatus = U_MEMORY_ALLOCATION_ERROR;
    172         return NULL;
    173     }
    174     uprv_memset(data, 0, totalSize);
    175 
    176 
    177     data->fMagic            = 0xb1a0;
    178     data->fFormatVersion[0] = 3;
    179     data->fFormatVersion[1] = 1;
    180     data->fFormatVersion[2] = 0;
    181     data->fFormatVersion[3] = 0;
    182     data->fLength           = totalSize;
    183     data->fCatCount         = fSetBuilder->getNumCharCategories();
    184 
    185     data->fFTable        = headerSize;
    186     data->fFTableLen     = forwardTableSize;
    187     data->fRTable        = data->fFTable  + forwardTableSize;
    188     data->fRTableLen     = reverseTableSize;
    189     data->fSFTable       = data->fRTable  + reverseTableSize;
    190     data->fSFTableLen    = safeFwdTableSize;
    191     data->fSRTable       = data->fSFTable + safeFwdTableSize;
    192     data->fSRTableLen    = safeRevTableSize;
    193 
    194     data->fTrie          = data->fSRTable + safeRevTableSize;
    195     data->fTrieLen       = fSetBuilder->getTrieSize();
    196     data->fStatusTable   = data->fTrie    + trieSize;
    197     data->fStatusTableLen= statusTableSize;
    198     data->fRuleSource    = data->fStatusTable + statusTableSize;
    199     data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
    200 
    201     uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
    202 
    203     fForwardTables->exportTable((uint8_t *)data + data->fFTable);
    204     fReverseTables->exportTable((uint8_t *)data + data->fRTable);
    205     fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
    206     fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
    207     fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
    208 
    209     int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
    210     for (i=0; i<fRuleStatusVals->size(); i++) {
    211         ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
    212     }
    213 
    214     strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
    215 
    216     return data;
    217 }
    218 
    219 
    220 
    221 
    222 
    223 
    224 //----------------------------------------------------------------------------------------
    225 //
    226 //  createRuleBasedBreakIterator    construct from source rules that are passed in
    227 //                                  in a UnicodeString
    228 //
    229 //----------------------------------------------------------------------------------------
    230 BreakIterator *
    231 RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
    232                                     UParseError      *parseError,
    233                                     UErrorCode       &status)
    234 {
    235     // status checked below
    236 
    237     //
    238     // Read the input rules, generate a parse tree, symbol table,
    239     // and list of all Unicode Sets referenced by the rules.
    240     //
    241     RBBIRuleBuilder  builder(rules, parseError, status);
    242     if (U_FAILURE(status)) { // status checked here bcos build below doesn't
    243         return NULL;
    244     }
    245     builder.fScanner->parse();
    246 
    247     //
    248     // UnicodeSet processing.
    249     //    Munge the Unicode Sets to create a set of character categories.
    250     //    Generate the mapping tables (TRIE) from input 32-bit characters to
    251     //    the character categories.
    252     //
    253     builder.fSetBuilder->build();
    254 
    255 
    256     //
    257     //   Generate the DFA state transition table.
    258     //
    259     builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
    260     builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
    261     builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
    262     builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
    263     if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
    264         builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
    265     {
    266         status = U_MEMORY_ALLOCATION_ERROR;
    267         delete builder.fForwardTables; builder.fForwardTables = NULL;
    268         delete builder.fReverseTables; builder.fReverseTables = NULL;
    269         delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
    270         delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
    271         return NULL;
    272     }
    273 
    274     builder.fForwardTables->build();
    275     builder.fReverseTables->build();
    276     builder.fSafeFwdTables->build();
    277     builder.fSafeRevTables->build();
    278 
    279 #ifdef RBBI_DEBUG
    280     if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
    281         builder.fForwardTables->printRuleStatusTable();
    282     }
    283 #endif
    284 
    285     //
    286     //   Package up the compiled data into a memory image
    287     //      in the run-time format.
    288     //
    289     RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
    290     if (U_FAILURE(*builder.fStatus)) {
    291         return NULL;
    292     }
    293 
    294 
    295     //
    296     //  Clean up the compiler related stuff
    297     //
    298 
    299 
    300     //
    301     //  Create a break iterator from the compiled rules.
    302     //     (Identical to creation from stored pre-compiled rules)
    303     //
    304     // status is checked after init in construction.
    305     RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
    306     if (U_FAILURE(status)) {
    307         delete This;
    308         This = NULL;
    309     }
    310     else if(This == NULL) { // test for NULL
    311         status = U_MEMORY_ALLOCATION_ERROR;
    312     }
    313     return This;
    314 }
    315 
    316 U_NAMESPACE_END
    317 
    318 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
    319