Home | History | Annotate | Download | only in common
      1 //
      2 //  rbbiscan.h
      3 //
      4 //  Copyright (C) 2002-2008, International Business Machines Corporation and others.
      5 //  All Rights Reserved.
      6 //
      7 //  This file contains declarations for class RBBIRuleScanner
      8 //
      9 
     10 
     11 #ifndef RBBISCAN_H
     12 #define RBBISCAN_H
     13 
     14 #include "unicode/utypes.h"
     15 #include "unicode/uobject.h"
     16 #include "unicode/rbbi.h"
     17 #include "unicode/uniset.h"
     18 #include "unicode/parseerr.h"
     19 #include "uhash.h"
     20 #include "uvector.h"
      21 #include "unicode/symtable.h"// For UnicodeSet parsing; this is the interface that
      22                           //    looks up references to $variables within a set.
     23 #include "rbbinode.h"
     24 //#include "rbbitblb.h"
     25 
     26 
     27 
     28 U_NAMESPACE_BEGIN
     29 
     30 class   RBBIRuleBuilder;
     31 class   RBBISymbolTable;
     32 
     33 
     34 //--------------------------------------------------------------------------------
     35 //
     36 //  class RBBIRuleScanner does the lowest level, character-at-a-time
     37 //                        scanning of break iterator rules.
     38 //
     39 //                        The output of the scanner is parse trees for
     40 //                        the rule expressions and a list of all Unicode Sets
     41 //                        encountered.
     42 //
     43 //--------------------------------------------------------------------------------
     44 
      45 class RBBIRuleScanner : public UMemory {
      46 public:
      47 
      48     enum {
      49         kStackSize = 100            // Capacity of both the state stack (fStack)
      50     };                              //   and the node stack (fNodeStack) used for
      51                                     //   rules parsing.  Corresponds roughly to the
      52                                     //   depth of parentheses nesting allowed in the rules.
      53 
      54     struct RBBIRuleChar {           // One character as delivered by the scanner.
      55         UChar32             fChar;      // The character value.
      56         UBool               fEscaped;   // Presumably true when the char was escaped/quoted in the rule text -- confirm.
      57     };
      58 
      59     RBBIRuleScanner(RBBIRuleBuilder  *rb);          // rb is presumably retained in fRB -- confirm in rbbiscan.cpp.
      60 
      61 
      62     virtual    ~RBBIRuleScanner();
      63 
      64     void        nextChar(RBBIRuleChar &c);          // Get the next char from the input stream.
      65                                                     //   Declared void: the char is presumably delivered through c -- confirm.
      66 
      67     UBool       push(const RBBIRuleChar &c);        // Push (unget) one character.
      68                                                     //   Only a single character may be pushed.
      69 
      70     void        parse();                            // Parse the rules, generating two parse
      71                                                     //   trees, one each for the forward and
      72                                                     //   reverse rules,
      73                                                     //   and a list of UnicodeSets encountered.
      74 
      75     /**
      76      * Return a rules string without unnecessary
      77      * characters.
      78      */
      79     static UnicodeString stripRules(const UnicodeString &rules);
      80 private:
      81 
      82     UBool       doParseActions(int32_t a);             // Presumably runs parse action number a; meaning of the result is not visible here -- confirm.
      83     void        error(UErrorCode e);                   // error reporting convenience function.
      84     void        fixOpStack(RBBINode::OpPrecedence p);
      85                                                        //   Takes an operator precedence p; presumably adjusts pending operator nodes -- TODO confirm.
      86     void        findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
      87                                                        //   setToAdopt is optional (defaults to NULL); the name suggests ownership is taken -- confirm.
      88     UChar32     nextCharLL();                          // Presumably the low-level character fetch beneath nextChar() -- confirm.
      89 #ifdef RBBI_DEBUG
      90     void        printNodeStack(const char *title);     // Declared only when RBBI_DEBUG is defined.
      91 #endif
      92     RBBINode    *pushNewNode(RBBINode::NodeType  t);
      93     void        scanSet();
      94 
      95 
      96     RBBIRuleBuilder               *fRB;              // The rule builder that we are part of.
      97 
      98     int32_t                       fScanIndex;        // Index of current character being processed
      99                                                      //   in the rule input string.
     100     int32_t                       fNextIndex;        // Index of the next character, which
     101                                                      //   is the first character not yet scanned.
     102     UBool                         fQuoteMode;        // Scan is in a 'quoted region'
     103     int32_t                       fLineNum;          // Line number in input file.
     104     int32_t                       fCharNum;          // Char position within the line.
     105     UChar32                       fLastChar;         // Previous char, needed to count CR-LF
     106                                                      //   as a single line, not two.
     107 
     108     RBBIRuleChar                  fC;                // Current char for parse state machine
     109                                                      //   processing.
     110     UnicodeString                 fVarName;          // $variableName, valid when we've just
     111                                                      //   scanned one.
     112 
     113     RBBIRuleTableEl               **fStateTable;     // State Transition Table for RBBI Rule
     114                                                      //   parsing.  index by p[state][char-class]
     115 
     116     uint16_t                      fStack[kStackSize];  // State stack, holds state pushes
     117     int32_t                       fStackPtr;           //  and pops as specified in the state
     118                                                        //  transition rules.
     119 
     120     RBBINode                      *fNodeStack[kStackSize]; // Node stack, holds nodes created
     121                                                            //  during the parse of a rule
     122     int32_t                        fNodeStackPtr;          // Position within fNodeStack (presumably the top entry -- confirm).
     123 
     124 
     125     UBool                          fReverseRule;     // True if the rule currently being scanned
     126                                                      //  is a reverse direction rule (if it
     127                                                      //  starts with a '!')
     128 
     129     UBool                          fLookAheadRule;   // True if the rule includes a '/'
     130                                                      //   somewhere within it.
     131 
     132     RBBISymbolTable               *fSymbolTable;     // symbol table, holds definitions of
     133                                                      //   $variable symbols.
     134 
     135     UHashtable                    *fSetTable;        // UnicodeSet hash table, holds indexes to
     136                                                      //   the sets created while parsing rules.
     137                                                      //   The key is the string used for creating
     138                                                      //   the set.
     139 
     140     UnicodeSet                     fRuleSets[10];    // Unicode Sets that are needed during
     141                                                      //  the scanning of RBBI rules.  The
     142                                                      //  indices for these are assigned by the
     143                                                      //  perl script that builds the state tables.
     144                                                      //  See rbbirpt.h.
     145 
     146     int32_t                        fRuleNum;         // Counts each rule as it is scanned.
     147 
     148     int32_t                        fOptionStart;     // Input index of start of a !!option
     149                                                      //   keyword, while being scanned.
     150 
     151     UnicodeSet *gRuleSet_rule_char;                  // NOTE: despite the 'g' prefix, these four are
     152     UnicodeSet *gRuleSet_white_space;                //   non-static, per-instance members.  Presumably
     153     UnicodeSet *gRuleSet_name_char;                  //   character classes used while scanning rule
     154     UnicodeSet *gRuleSet_name_start_char;            //   text; ownership is not visible here -- confirm.
     155 
     156     RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
     157     RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
     158 };
    159 
    160 U_NAMESPACE_END
    161 
    162 #endif
    163