Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 1999-2007, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   Date        Name        Description
      7 *   11/17/99    aliu        Creation.
      8 **********************************************************************
      9 */
     10 #ifndef RBT_H
     11 #define RBT_H
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/translit.h"
     18 #include "unicode/utypes.h"
     19 #include "unicode/parseerr.h"
     20 #include "unicode/udata.h"
     21 
     22 #define U_ICUDATA_TRANSLIT U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "translit"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 class TransliterationRuleData;
     27 
     28 /**
     29  * <p><code>RuleBasedTransliterator</code> is a transliterator
     30  * that reads a set of rules in order to determine how to perform
     31  * translations. Rule sets are stored in resource bundles indexed by
     32  * name. Rules within a rule set are separated by semicolons (';').
     33  * To include a literal semicolon, prefix it with a backslash ('\').
     34  * Whitespace, as defined by <code>Character.isWhitespace()</code>,
     35  * is ignored. If the first non-blank character on a line is '#',
     36  * the entire line is ignored as a comment. </p>
     37  *
     38  * <p>Each set of rules consists of two groups, one forward, and one
     39  * reverse. This is a convention that is not enforced; rules for one
     40  * direction may be omitted, with the result that translations in
     41  * that direction will not modify the source text. In addition,
     42  * bidirectional forward-reverse rules may be specified for
     43  * symmetrical transformations.</p>
     44  *
     45  * <p><b>Rule syntax</b> </p>
     46  *
     47  * <p>Rule statements take one of the following forms: </p>
     48  *
     49  * <dl>
     50  *     <dt><code>$alefmadda=\u0622;</code></dt>
     51  *     <dd><strong>Variable definition.</strong> The name on the
     52  *         left is assigned the text on the right. In this example,
     53  *         after this statement, instances of the left hand name,
     54  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
     55  *         the Unicode character U+0622. Variable names must begin
     56  *         with a letter and consist only of letters, digits, and
     57  *         underscores. Case is significant. Duplicate names cause
     58  *         an exception to be thrown, that is, variables cannot be
     59  *         redefined. The right hand side may contain well-formed
     60  *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
     61  *         The right hand side may contain embedded <code>UnicodeSet</code>
     62  *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
     63  *     <dd>&nbsp;</dd>
     64  *     <dt><code>ai&gt;$alefmadda;</code></dt>
     65  *     <dd><strong>Forward translation rule.</strong> This rule
     66  *         states that the string on the left will be changed to the
     67  *         string on the right when performing forward
     68  *         transliteration.</dd>
     69  *     <dt>&nbsp;</dt>
     70  *     <dt><code>ai&lt;$alefmadda;</code></dt>
     71  *     <dd><strong>Reverse translation rule.</strong> This rule
     72  *         states that the string on the right will be changed to
     73  *         the string on the left when performing reverse
     74  *         transliteration.</dd>
     75  * </dl>
     76  *
     77  * <dl>
     78  *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
     79  *     <dd><strong>Bidirectional translation rule.</strong> This
     80  *         rule states that the string on the left will be changed
     81  *         to the string on the right when performing forward
     82  *         transliteration, and vice versa when performing reverse
     83  *         transliteration.</dd>
     84  * </dl>
     85  *
     86  * <p>Translation rules consist of a <em>match pattern</em> and an <em>output
     87  * string</em>. The match pattern consists of literal characters,
     88  * optionally preceded by context, and optionally followed by
     89  * context. Context characters, like literal pattern characters,
     90  * must be matched in the text being transliterated. However, unlike
     91  * literal pattern characters, they are not replaced by the output
     92  * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
     93  * indicates the characters &quot;<code>def</code>&quot; must be
     94  * preceded by &quot;<code>abc</code>&quot; for a successful match.
     95  * If there is a successful match, &quot;<code>def</code>&quot; will
     96  * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
     97  * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
     98  * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
     99  * (or &quot;<code>123}456</code>&quot;) in which the literal
    100  * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
    101  * </p>
    102  *
    103  * <p>The output string of a forward or reverse rule consists of
    104  * characters to replace the literal pattern characters. If the
    105  * output string contains the character '<code>|</code>', this is
    106  * taken to indicate the location of the <em>cursor</em> after
    107  * replacement. The cursor is the point in the text at which the
    108  * next replacement, if any, will be applied. The cursor is usually
    109  * placed within the replacement text; however, it can actually be
    110  * placed into the preceding or following context by using the
    111  * special character '<code>@</code>'. Examples:</p>
    112  *
    113  * <blockquote>
    114  *     <p><code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor
    115  *     before a<br>
    116  *     {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between
    117  *     y and z</code></p>
    118  * </blockquote>
    119  *
    120  * <p><b>UnicodeSet</b></p>
    121  *
    122  * <p><code>UnicodeSet</code> patterns may appear anywhere that
    123  * makes sense. They may appear in variable definitions.
    124  * Contrariwise, <code>UnicodeSet</code> patterns may themselves
    125  * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
    126  * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
    127  *
    128  * <p><code>UnicodeSet</code> patterns may also be embedded directly
    129  * into rule strings. Thus, the following two rules are equivalent:</p>
    130  *
    131  * <blockquote>
    132  *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
    133  *     [aeiou]&gt;'*';
    134  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
    135  *     Another way</code></p>
    136  * </blockquote>
    137  *
    138  * <p>See {@link UnicodeSet} for more documentation and examples.</p>
    139  *
    140  * <p><b>Segments</b></p>
    141  *
    142  * <p>Segments of the input string can be matched and copied to the
    143  * output string. This makes certain sets of rules simpler and more
    144  * general, and makes reordering possible. For example:</p>
    145  *
    146  * <blockquote>
    147  *     <p><code>([a-z]) &gt; $1 $1;
    148  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
    149  *     double lowercase letters<br>
    150  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
    151  * </blockquote>
    152  *
    153  * <p>The segment of the input string to be copied is delimited by
    154  * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
    155  * nine segments may be defined. Segments may not overlap. In the
    156  * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
    157  * represent the input string segments, in left-to-right order of
    158  * definition.</p>
    159  *
    160  * <p><b>Anchors</b></p>
    161  *
    162  * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
    163  * special characters '<code>^</code>' and '<code>$</code>'. For example:</p>
    164  *
    165  * <blockquote>
    166  *   <p><code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text<br>
    167  *   &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
    168  *   of 'a'<br>
    169  *   &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text<br>
    170  *   &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
    171  *   of 'z'</code></p>
    172  * </blockquote>
    173  *
    174  * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
    175  * This is done by including a virtual anchor character '<code>$</code>' at the end of the
    176  * set pattern. Although this is usually the match character for the end anchor, the set will
    177  * match either the beginning or the end of the text, depending on its placement. For
    178  * example:</p>
    179  *
    180  * <blockquote>
    181  *   <p><code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor<br>
    182  *   $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start<br>
    183  *   &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code></p>
    184  * </blockquote>
    185  *
    186  * <p><b>Example</b> </p>
    187  *
    188  * <p>The following example rules illustrate many of the features of
    189  * the rule language. </p>
    190  *
    191  * <table border="0" cellpadding="4">
    192  *     <tr>
    193  *         <td valign="top">Rule 1.</td>
    194  *         <td valign="top" nowrap><code>abc{def}&gt;x|y</code></td>
    195  *     </tr>
    196  *     <tr>
    197  *         <td valign="top">Rule 2.</td>
    198  *         <td valign="top" nowrap><code>xyz&gt;r</code></td>
    199  *     </tr>
    200  *     <tr>
    201  *         <td valign="top">Rule 3.</td>
    202  *         <td valign="top" nowrap><code>yz&gt;q</code></td>
    203  *     </tr>
    204  * </table>
    205  *
    206  * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
    207  * yields the following results: </p>
    208  *
    209  * <table border="0" cellpadding="4">
    210  *     <tr>
    211  *         <td valign="top" nowrap><code>|adefabcdefz</code></td>
    212  *         <td valign="top">Initial state, no rules match. Advance
    213  *         cursor.</td>
    214  *     </tr>
    215  *     <tr>
    216  *         <td valign="top" nowrap><code>a|defabcdefz</code></td>
    217  *         <td valign="top">Still no match. Rule 1 does not match
    218  *         because the preceding context is not present.</td>
    219  *     </tr>
    220  *     <tr>
    221  *         <td valign="top" nowrap><code>ad|efabcdefz</code></td>
    222  *         <td valign="top">Still no match. Keep advancing until
    223  *         there is a match...</td>
    224  *     </tr>
    225  *     <tr>
    226  *         <td valign="top" nowrap><code>ade|fabcdefz</code></td>
    227  *         <td valign="top">...</td>
    228  *     </tr>
    229  *     <tr>
    230  *         <td valign="top" nowrap><code>adef|abcdefz</code></td>
    231  *         <td valign="top">...</td>
    232  *     </tr>
    233  *     <tr>
    234  *         <td valign="top" nowrap><code>adefa|bcdefz</code></td>
    235  *         <td valign="top">...</td>
    236  *     </tr>
    237  *     <tr>
    238  *         <td valign="top" nowrap><code>adefab|cdefz</code></td>
    239  *         <td valign="top">...</td>
    240  *     </tr>
    241  *     <tr>
    242  *         <td valign="top" nowrap><code>adefabc|defz</code></td>
    243  *         <td valign="top">Rule 1 matches; replace &quot;<code>def</code>&quot;
    244  *         with &quot;<code>xy</code>&quot; and back up the cursor
    245  *         to before the '<code>y</code>'.</td>
    246  *     </tr>
    247  *     <tr>
    248  *         <td valign="top" nowrap><code>adefabcx|yz</code></td>
    249  *         <td valign="top">Although &quot;<code>xyz</code>&quot; is
    250  *         present, rule 2 does not match because the cursor is
    251  *         before the '<code>y</code>', not before the '<code>x</code>'.
    252  *         Rule 3 does match. Replace &quot;<code>yz</code>&quot;
    253  *         with &quot;<code>q</code>&quot;.</td>
    254  *     </tr>
    255  *     <tr>
    256  *         <td valign="top" nowrap><code>adefabcxq|</code></td>
    257  *         <td valign="top">The cursor is at the end;
    258  *         transliteration is complete.</td>
    259  *     </tr>
    260  * </table>
    261  *
    262  * <p>The order of rules is significant. If multiple rules may match
    263  * at some point, the first matching rule is applied. </p>
    264  *
    265  * <p>Forward and reverse rules may have an empty output string.
    266  * Otherwise, an empty left or right hand side of any statement is a
    267  * syntax error. </p>
    268  *
    269  * <p>Single quotes are used to quote any character other than a
    270  * digit or letter. To specify a single quote itself, inside or
    271  * outside of quotes, use two single quotes in a row. For example,
    272  * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
    273  * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
    274  * </p>
    275  *
    276  * <p><b>Notes</b> </p>
    277  *
    278  * <p>While a RuleBasedTransliterator is being built, it checks that
    279  * the rules are added in proper order. For example, if the rule
    280  * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
    281  * then the second rule will throw an exception. The reason is that
    282  * the second rule can never be triggered, since the first rule
    283  * always matches anything it matches. In other words, the first
    284  * rule <em>masks</em> the second rule. </p>
    285  *
    286  * @author Alan Liu
    287  * @internal Use transliterator factory methods instead since this class will be removed in that release.
    288  */
    289 class RuleBasedTransliterator : public Transliterator {
    290 private:
    291     /**
    292      * The rule data used by this transliterator. The data object was
    293      * designed to be immutable, so it can be freely shared with other
    294      * instances of RBT, as long as we do NOT own this object.
             * Whether this instance owns it is recorded in isDataOwned.
    294      *  TODO:  data is no longer immutable.  See bugs #1866, 2155
    295      */
    296     TransliterationRuleData* fData;
    297 
    298     /**
    299      * If true, we own the data object (fData) and must delete it.
    300      */
    301     UBool isDataOwned;
    302 
    303 public:
    304 
    305     /**
    306      * Constructs a new transliterator from the given rules.
             * @param id the id for the transliterator.
    307      * @param rules rules, separated by ';'
    308      * @param direction either FORWARD or REVERSE.
             * @param adoptedFilter the filter for the transliterator.
             *        NOTE(review): the name suggests ownership is taken -- confirm.
             * @param parseError NOTE(review): presumably receives details of a
             *        rule syntax error -- confirm against the implementation.
             * @param status NOTE(review): presumably receives the error code on
             *        failure -- confirm against the implementation.
    309      * @exception IllegalArgumentException if rules are malformed.
             *        NOTE(review): this wording looks inherited from the Java
             *        documentation; malformed rules are presumably reported via
             *        parseError/status instead -- confirm.
    310      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    311      */
    312     RuleBasedTransliterator(const UnicodeString& id,
    313                             const UnicodeString& rules,
    314                             UTransDirection direction,
    315                             UnicodeFilter* adoptedFilter,
    316                             UParseError& parseError,
    317                             UErrorCode& status);
    318 
    319     /**
    320      * Constructs a new transliterator from the given rules (variant without a UParseError; the declaration below is commented out).
    321      * @param rules rules, separated by ';'
    322      * @param direction either FORWARD or REVERSE.
    323      * @exception IllegalArgumentException if rules are malformed.
    324      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    325      */
    326     /*RuleBasedTransliterator(const UnicodeString& id,
    327                             const UnicodeString& rules,
    328                             UTransDirection direction,
    329                             UnicodeFilter* adoptedFilter,
    330                             UErrorCode& status);*/
    331 
    332     /**
    333      * Convenience constructor with no filter (declaration below is commented out).
    334      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    335      */
    336     /*RuleBasedTransliterator(const UnicodeString& id,
    337                             const UnicodeString& rules,
    338                             UTransDirection direction,
    339                             UErrorCode& status);*/
    340 
    341     /**
    342      * Convenience constructor with no filter and FORWARD direction (declaration below is commented out).
    343      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    344      */
    345     /*RuleBasedTransliterator(const UnicodeString& id,
    346                             const UnicodeString& rules,
    347                             UErrorCode& status);*/
    348 
    349     /**
    350      * Convenience constructor with FORWARD direction (declaration below is commented out).
    351      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    352      */
    353     /*RuleBasedTransliterator(const UnicodeString& id,
    354                             const UnicodeString& rules,
    355                             UnicodeFilter* adoptedFilter,
    356                             UErrorCode& status);*/
    357 private:
    358 
    359      friend class TransliteratorRegistry; // to access the convenience ctor below that takes a TransliterationRuleData
    360     /**
    361      * Convenience constructor that takes already-built rule data instead of rule text.
    362      * @param id            the id for the transliterator.
    363      * @param theData       the rule data for the transliterator.
    364      * @param adoptedFilter the filter for the transliterator (defaults to 0, i.e. no filter argument supplied)
    365      */
    366     RuleBasedTransliterator(const UnicodeString& id,
    367                             const TransliterationRuleData* theData,
    368                             UnicodeFilter* adoptedFilter = 0);
    369 
    370 
    371     friend class Transliterator; // to access the following ctor
    372 
    373     /**
    374      * Internal constructor.
    375      * @param id            the id for the transliterator.
    376      * @param data          the rule data for the transliterator.
    377      * @param isDataAdopted determines who will own the 'data' object. If true, the caller should not delete 'data'.
    378      */
    379     RuleBasedTransliterator(const UnicodeString& id,
    380                             TransliterationRuleData* data,
    381                             UBool isDataAdopted);
    382 
    383 public:
    384 
    385     /**
    386      * Copy constructor.
    387      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    388      */
    389     RuleBasedTransliterator(const RuleBasedTransliterator&);
    390 
            // Destructor. NOTE(review): per the isDataOwned comment, presumably
            // deletes fData when this instance owns it -- confirm in the .cpp.
    391     virtual ~RuleBasedTransliterator();
    392 
    393     /**
    394      * Implement Transliterator API: returns a copy of this object as a Transliterator pointer.
    395      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    396      */
    397     virtual Transliterator* clone(void) const;
    398 
    399 protected:
    400     /**
    401      * Implements {@link Transliterator#handleTransliterate}.
    402      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    403      */
    404     virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets,
    405                                      UBool isIncremental) const;
    406 
    407 public:
    408     /**
    409      * Return a representation of this transliterator as source rules.
    410      * These rules will produce an equivalent transliterator if used
    411      * to construct a new transliterator.
    412      * @param result the string to receive the rules.  Previous
    413      * contents will be deleted.
    414      * @param escapeUnprintable if TRUE then convert unprintable
    415      * characters to their hex escape representations, \uxxxx or
    416      * \Uxxxxxxxx.  Unprintable characters are those other than
    417      * U+000A, U+0020..U+007E.
    418      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    419      */
    420     virtual UnicodeString& toRules(UnicodeString& result,
    421                                    UBool escapeUnprintable) const;
    422 
    423 protected:
    424     /**
    425      * Implement Transliterator framework
    426      */
    427     virtual void handleGetSourceSet(UnicodeSet& result) const;
    428 
    429 public:
    430     /**
    431      * Override Transliterator framework
    432      */
    433     virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
    434 
    435     /**
    436      * Return the class ID for this class.  This is useful only for
    437      * comparing to a return value from getDynamicClassID().  For example:
    438      * <pre>
    439      * .      Base* polymorphic_pointer = createPolymorphicObject();
    440      * .      if (polymorphic_pointer->getDynamicClassID() ==
    441      * .          Derived::getStaticClassID()) ...
    442      * </pre>
    443      * @return          The class ID for all objects of this class.
    444      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    445      */
    446     U_I18N_API static UClassID U_EXPORT2 getStaticClassID(void);
    447 
    448     /**
    449      * Returns a unique class ID <b>polymorphically</b>.  This method
    450      * is to implement a simple version of RTTI, since not all C++
    451      * compilers support genuine RTTI.  Polymorphic operator==() and
    452      * clone() methods call this method.
    453      *
    454      * @return The class ID for this object. All objects of a given
    455      * class have the same class ID.  Objects of other classes have
    456      * different class IDs.
    457      */
    458     virtual UClassID getDynamicClassID(void) const;
    459 
    460 private:
    461 
            // Private construction helper taking the same rules/direction/
            // parseError/status arguments as the public rule-string constructor.
            // NOTE(review): presumably parses `rules` and sets up fData; the
            // implementation is not in this header -- confirm.
    462     void _construct(const UnicodeString& rules,
    463                     UTransDirection direction,
    464                     UParseError& parseError,
    465                     UErrorCode& status);
    466 };
    467 
    468 
    469 U_NAMESPACE_END
    470 
    471 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    472 
    473 #endif
    474