Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 1999-2007, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   11/17/99    aliu        Creation.
     10 **********************************************************************
     11 */
     12 #ifndef RBT_H
     13 #define RBT_H
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_TRANSLITERATION
     18 
     19 #include "unicode/translit.h"
     20 #include "unicode/utypes.h"
     21 #include "unicode/parseerr.h"
     22 #include "unicode/udata.h"
     23 
     24 #define U_ICUDATA_TRANSLIT U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "translit"
     25 
     26 U_NAMESPACE_BEGIN
     27 
     28 class TransliterationRuleData;
     29 
     30 /**
     31  * <code>RuleBasedTransliterator</code> is a transliterator
     32  * that reads a set of rules in order to determine how to perform
     33  * translations. Rule sets are stored in resource bundles indexed by
     34  * name. Rules within a rule set are separated by semicolons (';').
     35  * To include a literal semicolon, prefix it with a backslash ('\').
     36  * Whitespace, as defined by <code>Character.isWhitespace()</code>,
     37  * is ignored. If the first non-blank character on a line is '#',
     38  * the entire line is ignored as a comment.
     39  *
     40  * <p>Each set of rules consists of two groups, one forward, and one
     41  * reverse. This is a convention that is not enforced; rules for one
     42  * direction may be omitted, with the result that translations in
     43  * that direction will not modify the source text. In addition,
     44  * bidirectional forward-reverse rules may be specified for
     45  * symmetrical transformations.</p>
     46  *
     47  * <p><b>Rule syntax</b> </p>
     48  *
     49  * <p>Rule statements take one of the following forms: </p>
     50  *
     51  * <dl>
     52  *     <dt><code>$alefmadda=\u0622;</code></dt>
     53  *     <dd><strong>Variable definition.</strong> The name on the
     54  *         left is assigned the text on the right. In this example,
     55  *         after this statement, instances of the left hand name,
     56  *         &quot;<code>$alefmadda</code>&quot;, will be replaced by
     57  *         the Unicode character U+0622. Variable names must begin
     58  *         with a letter and consist only of letters, digits, and
     59  *         underscores. Case is significant. Duplicate names cause
     60  *         an exception to be thrown, that is, variables cannot be
     61  *         redefined. The right hand side may contain well-formed
     62  *         text of any length, including no text at all (&quot;<code>$empty=;</code>&quot;).
     63  *         The right hand side may contain embedded <code>UnicodeSet</code>
     64  *         patterns, for example, &quot;<code>$softvowel=[eiyEIY]</code>&quot;.</dd>
     65  *     <dd>&nbsp;</dd>
     66  *     <dt><code>ai&gt;$alefmadda;</code></dt>
     67  *     <dd><strong>Forward translation rule.</strong> This rule
     68  *         states that the string on the left will be changed to the
     69  *         string on the right when performing forward
     70  *         transliteration.</dd>
     71  *     <dt>&nbsp;</dt>
     72  *     <dt><code>ai&lt;$alefmadda;</code></dt>
     73  *     <dd><strong>Reverse translation rule.</strong> This rule
     74  *         states that the string on the right will be changed to
     75  *         the string on the left when performing reverse
     76  *         transliteration.</dd>
     77  * </dl>
     78  *
     79  * <dl>
     80  *     <dt><code>ai&lt;&gt;$alefmadda;</code></dt>
     81  *     <dd><strong>Bidirectional translation rule.</strong> This
     82  *         rule states that the string on the left will be changed
     83  *         to the string on the right when performing forward
     84  *         transliteration, and vice versa when performing reverse
     85  *         transliteration.</dd>
     86  * </dl>
     87  *
     88  * <p>Translation rules consist of a <em>match pattern</em> and an <em>output
     89  * string</em>. The match pattern consists of literal characters,
     90  * optionally preceded by context, and optionally followed by
     91  * context. Context characters, like literal pattern characters,
     92  * must be matched in the text being transliterated. However, unlike
     93  * literal pattern characters, they are not replaced by the output
     94  * text. For example, the pattern &quot;<code>abc{def}</code>&quot;
     95  * indicates the characters &quot;<code>def</code>&quot; must be
     96  * preceded by &quot;<code>abc</code>&quot; for a successful match.
     97  * If there is a successful match, &quot;<code>def</code>&quot; will
     98  * be replaced, but not &quot;<code>abc</code>&quot;. The final '<code>}</code>'
     99  * is optional, so &quot;<code>abc{def</code>&quot; is equivalent to
    100  * &quot;<code>abc{def}</code>&quot;. Another example is &quot;<code>{123}456</code>&quot;
    101  * (or &quot;<code>123}456</code>&quot;) in which the literal
    102  * pattern &quot;<code>123</code>&quot; must be followed by &quot;<code>456</code>&quot;.
    103  * </p>
    104  *
    105  * <p>The output string of a forward or reverse rule consists of
    106  * characters to replace the literal pattern characters. If the
    107  * output string contains the character '<code>|</code>', this is
    108  * taken to indicate the location of the <em>cursor</em> after
    109  * replacement. The cursor is the point in the text at which the
    110  * next replacement, if any, will be applied. The cursor is usually
    111  * placed within the replacement text; however, it can actually be
    112  * placed into the preceding or following context by using the
    113  * special character '<code>@</code>'. Examples:</p>
    114  *
    115  * <blockquote>
    116  *     <p><code>a {foo} z &gt; | @ bar; # foo -&gt; bar, move cursor
    117  *     before a<br>
    118  *     {foo} xyz &gt; bar @@|; #&nbsp;foo -&gt; bar, cursor between
    119  *     y and z</code></p>
    120  * </blockquote>
    121  *
    122  * <p><b>UnicodeSet</b></p>
    123  *
    124  * <p><code>UnicodeSet</code> patterns may appear anywhere that
    125  * makes sense. They may appear in variable definitions.
    126  * Contrariwise, <code>UnicodeSet</code> patterns may themselves
    127  * contain variable references, such as &quot;<code>$a=[a-z];$not_a=[^$a]</code>&quot;,
    128  * or &quot;<code>$range=a-z;$ll=[$range]</code>&quot;.</p>
    129  *
    130  * <p><code>UnicodeSet</code> patterns may also be embedded directly
    131  * into rule strings. Thus, the following two rules are equivalent:</p>
    132  *
    133  * <blockquote>
    134  *     <p><code>$vowel=[aeiou]; $vowel&gt;'*'; # One way to do this<br>
    135  *     [aeiou]&gt;'*';
    136  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
    137  *     Another way</code></p>
    138  * </blockquote>
    139  *
    140  * <p>See {@link UnicodeSet} for more documentation and examples.</p>
    141  *
    142  * <p><b>Segments</b></p>
    143  *
    144  * <p>Segments of the input string can be matched and copied to the
    145  * output string. This makes certain sets of rules simpler and more
    146  * general, and makes reordering possible. For example:</p>
    147  *
    148  * <blockquote>
    149  *     <p><code>([a-z]) &gt; $1 $1;
    150  *     &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
    151  *     double lowercase letters<br>
    152  *     ([:Lu:]) ([:Ll:]) &gt; $2 $1; # reverse order of Lu-Ll pairs</code></p>
    153  * </blockquote>
    154  *
    155  * <p>The segment of the input string to be copied is delimited by
    156  * &quot;<code>(</code>&quot; and &quot;<code>)</code>&quot;. Up to
    157  * nine segments may be defined. Segments may not overlap. In the
    158  * output string, &quot;<code>$1</code>&quot; through &quot;<code>$9</code>&quot;
    159  * represent the input string segments, in left-to-right order of
    160  * definition.</p>
    161  *
    162  * <p><b>Anchors</b></p>
    163  *
    164  * <p>Patterns can be anchored to the beginning or the end of the text. This is done with the
    165  * special characters '<code>^</code>' and '<code>$</code>'. For example:</p>
    166  *
    167  * <blockquote>
    168  *   <p><code>^ a&nbsp;&nbsp; &gt; 'BEG_A'; &nbsp;&nbsp;# match 'a' at start of text<br>
    169  *   &nbsp; a&nbsp;&nbsp; &gt; 'A';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
    170  *   of 'a'<br>
    171  *   &nbsp; z $ &gt; 'END_Z'; &nbsp;&nbsp;# match 'z' at end of text<br>
    172  *   &nbsp; z&nbsp;&nbsp; &gt; 'Z';&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # match other instances
    173  *   of 'z'</code></p>
    174  * </blockquote>
    175  *
    176  * <p>It is also possible to match the beginning or the end of the text using a <code>UnicodeSet</code>.
    177  * This is done by including a virtual anchor character '<code>$</code>' at the end of the
    178  * set pattern. Although this is usually the match character for the end anchor, the set will
    179  * match either the beginning or the end of the text, depending on its placement. For
    180  * example:</p>
    181  *
    182  * <blockquote>
    183  *   <p><code>$x = [a-z$]; &nbsp;&nbsp;# match 'a' through 'z' OR anchor<br>
    184  *   $x 1&nbsp;&nbsp;&nbsp; &gt; 2;&nbsp;&nbsp; # match '1' after a-z or at the start<br>
    185  *   &nbsp;&nbsp; 3 $x &gt; 4; &nbsp;&nbsp;# match '3' before a-z or at the end</code></p>
    186  * </blockquote>
    187  *
    188  * <p><b>Example</b> </p>
    189  *
    190  * <p>The following example rules illustrate many of the features of
    191  * the rule language. </p>
    192  *
    193  * <table border="0" cellpadding="4">
    194  *     <tr>
    195  *         <td valign="top">Rule 1.</td>
    196  *         <td valign="top" nowrap><code>abc{def}&gt;x|y</code></td>
    197  *     </tr>
    198  *     <tr>
    199  *         <td valign="top">Rule 2.</td>
    200  *         <td valign="top" nowrap><code>xyz&gt;r</code></td>
    201  *     </tr>
    202  *     <tr>
    203  *         <td valign="top">Rule 3.</td>
    204  *         <td valign="top" nowrap><code>yz&gt;q</code></td>
    205  *     </tr>
    206  * </table>
    207  *
    208  * <p>Applying these rules to the string &quot;<code>adefabcdefz</code>&quot;
    209  * yields the following results: </p>
    210  *
    211  * <table border="0" cellpadding="4">
    212  *     <tr>
    213  *         <td valign="top" nowrap><code>|adefabcdefz</code></td>
    214  *         <td valign="top">Initial state, no rules match. Advance
    215  *         cursor.</td>
    216  *     </tr>
    217  *     <tr>
    218  *         <td valign="top" nowrap><code>a|defabcdefz</code></td>
    219  *         <td valign="top">Still no match. Rule 1 does not match
    220  *         because the preceding context is not present.</td>
    221  *     </tr>
    222  *     <tr>
    223  *         <td valign="top" nowrap><code>ad|efabcdefz</code></td>
    224  *         <td valign="top">Still no match. Keep advancing until
    225  *         there is a match...</td>
    226  *     </tr>
    227  *     <tr>
    228  *         <td valign="top" nowrap><code>ade|fabcdefz</code></td>
    229  *         <td valign="top">...</td>
    230  *     </tr>
    231  *     <tr>
    232  *         <td valign="top" nowrap><code>adef|abcdefz</code></td>
    233  *         <td valign="top">...</td>
    234  *     </tr>
    235  *     <tr>
    236  *         <td valign="top" nowrap><code>adefa|bcdefz</code></td>
    237  *         <td valign="top">...</td>
    238  *     </tr>
    239  *     <tr>
    240  *         <td valign="top" nowrap><code>adefab|cdefz</code></td>
    241  *         <td valign="top">...</td>
    242  *     </tr>
    243  *     <tr>
    244  *         <td valign="top" nowrap><code>adefabc|defz</code></td>
    245  *         <td valign="top">Rule 1 matches; replace &quot;<code>def</code>&quot;
    246  *         with &quot;<code>xy</code>&quot; and back up the cursor
    247  *         to before the '<code>y</code>'.</td>
    248  *     </tr>
    249  *     <tr>
    250  *         <td valign="top" nowrap><code>adefabcx|yz</code></td>
    251  *         <td valign="top">Although &quot;<code>xyz</code>&quot; is
    252  *         present, rule 2 does not match because the cursor is
    253  *         before the '<code>y</code>', not before the '<code>x</code>'.
    254  *         Rule 3 does match. Replace &quot;<code>yz</code>&quot;
    255  *         with &quot;<code>q</code>&quot;.</td>
    256  *     </tr>
    257  *     <tr>
    258  *         <td valign="top" nowrap><code>adefabcxq|</code></td>
    259  *         <td valign="top">The cursor is at the end;
    260  *         transliteration is complete.</td>
    261  *     </tr>
    262  * </table>
    263  *
    264  * <p>The order of rules is significant. If multiple rules may match
    265  * at some point, the first matching rule is applied. </p>
    266  *
    267  * <p>Forward and reverse rules may have an empty output string.
    268  * Otherwise, an empty left or right hand side of any statement is a
    269  * syntax error. </p>
    270  *
    271  * <p>Single quotes are used to quote any character other than a
    272  * digit or letter. To specify a single quote itself, inside or
    273  * outside of quotes, use two single quotes in a row. For example,
    274  * the rule &quot;<code>'&gt;'&gt;o''clock</code>&quot; changes the
    275  * string &quot;<code>&gt;</code>&quot; to the string &quot;<code>o'clock</code>&quot;.
    276  * </p>
    277  *
    278  * <p><b>Notes</b> </p>
    279  *
    280  * <p>While a RuleBasedTransliterator is being built, it checks that
    281  * the rules are added in proper order. For example, if the rule
    282  * &quot;a&gt;x&quot; is followed by the rule &quot;ab&gt;y&quot;,
    283  * then the second rule will throw an exception. The reason is that
    284  * the second rule can never be triggered, since the first rule
    285  * always matches anything it matches. In other words, the first
    286  * rule <em>masks</em> the second rule. </p>
    287  *
    288  * @author Alan Liu
    289  * @internal Use transliterator factory methods instead since this class will be removed in that release.
    290  */
    291 class RuleBasedTransliterator : public Transliterator {
    292 private:
    293     /**
    294      * Rule data for this transliterator. Documented as immutable and so freely
    295      * shareable with other instances of RBT, as long as we do NOT own this object.
    296      * TODO:  data is no longer immutable.  See bugs #1866, 2155
    297      */
    298     TransliterationRuleData* fData;
    299 
    300     /**
    301      * If true, we own the data object and must delete it.
    302      */
    303     UBool isDataOwned;
    304 
    305 public:
    306 
    307     /**
    308      * Constructs a new transliterator from the given rules.
             * @param id the id for the transliterator.
    309      * @param rules rules, separated by ';'
    310      * @param direction either FORWARD or REVERSE.
             * @param adoptedFilter the filter for the transliterator.
             *        NOTE(review): the name suggests ownership passes to this
             *        object; confirm in the implementation.
             * @param parseError NOTE(review): presumably receives details of a
             *        rule parsing failure; confirm in the implementation.
             * @param status NOTE(review): presumably the error code through which
             *        failures such as malformed rules are reported; confirm.
    311      * @exception IllegalArgumentException if rules are malformed.
             *        NOTE(review): this tag looks carried over from Java
             *        documentation; this signature takes a UErrorCode reference,
             *        so confirm how malformed rules are actually reported.
    312      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    313      */
    314     RuleBasedTransliterator(const UnicodeString& id,
    315                             const UnicodeString& rules,
    316                             UTransDirection direction,
    317                             UnicodeFilter* adoptedFilter,
    318                             UParseError& parseError,
    319                             UErrorCode& status);
    320 
    321     /**
    322      * Constructs a new transliterator from the given rules.
             * NOTE: the declaration below is commented out, so this overload
             * (without a UParseError parameter) is not currently declared.
    323      * @param rules rules, separated by ';'
    324      * @param direction either FORWARD or REVERSE.
    325      * @exception IllegalArgumentException if rules are malformed.
    326      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    327      */
    328     /*RuleBasedTransliterator(const UnicodeString& id,
    329                             const UnicodeString& rules,
    330                             UTransDirection direction,
    331                             UnicodeFilter* adoptedFilter,
    332                             UErrorCode& status);*/
    333 
    334     /**
    335      * Convenience constructor with no filter.
             * NOTE: the declaration below is commented out, so this overload
             * is not currently declared.
    336      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    337      */
    338     /*RuleBasedTransliterator(const UnicodeString& id,
    339                             const UnicodeString& rules,
    340                             UTransDirection direction,
    341                             UErrorCode& status);*/
    342 
    343     /**
    344      * Convenience constructor with no filter and FORWARD direction.
             * NOTE: the declaration below is commented out, so this overload
             * is not currently declared.
    345      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    346      */
    347     /*RuleBasedTransliterator(const UnicodeString& id,
    348                             const UnicodeString& rules,
    349                             UErrorCode& status);*/
    350 
    351     /**
    352      * Convenience constructor with FORWARD direction.
             * NOTE: the declaration below is commented out, so this overload
             * is not currently declared.
    353      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    354      */
    355     /*RuleBasedTransliterator(const UnicodeString& id,
    356                             const UnicodeString& rules,
    357                             UnicodeFilter* adoptedFilter,
    358                             UErrorCode& status);*/
    359 private:
    360 
    361      friend class TransliteratorRegistry; // to access TransliterationRuleData convenience ctor
    362     /**
    363      * Convenience constructor.
    364      * @param id            the id for the transliterator.
    365      * @param theData       the rule data for the transliterator.
    366      * @param adoptedFilter the filter for the transliterator; defaults to 0
             *                      (a null pointer) when the argument is omitted.
    367      */
    368     RuleBasedTransliterator(const UnicodeString& id,
    369                             const TransliterationRuleData* theData,
    370                             UnicodeFilter* adoptedFilter = 0);
    371 
    372 
    373     friend class Transliterator; // to access following ctor
    374 
    375     /**
    376      * Internal constructor.
    377      * @param id            the id for the transliterator.
    378      * @param data          the rule data for the transliterator.
    379      * @param isDataAdopted determines who will own the 'data' object. If true, the caller should not delete 'data'.
    380      */
    381     RuleBasedTransliterator(const UnicodeString& id,
    382                             TransliterationRuleData* data,
    383                             UBool isDataAdopted);
    384 
    385 public:
    386 
    387     /**
    388      * Copy constructor.
    389      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    390      */
    391     RuleBasedTransliterator(const RuleBasedTransliterator&);
    392 
            /**
             * Destructor.
             * NOTE(review): per the isDataOwned member comment, presumably deletes
             * fData when the data is owned; confirm in the implementation.
             */
    393     virtual ~RuleBasedTransliterator();
    394 
    395     /**
    396      * Implement Transliterator API.
    397      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    398      */
    399     virtual Transliterator* clone(void) const;
    400 
    401 protected:
    402     /**
    403      * Implements {@link Transliterator#handleTransliterate}.
    404      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    405      */
    406     virtual void handleTransliterate(Replaceable& text, UTransPosition& offsets,
    407                                      UBool isIncremental) const;
    408 
    409 public:
    410     /**
    411      * Return a representation of this transliterator as source rules.
    412      * These rules will produce an equivalent transliterator if used
    413      * to construct a new transliterator.
    414      * @param result the string to receive the rules.  Previous
    415      * contents will be deleted.
    416      * @param escapeUnprintable if TRUE then convert unprintable
    417      * characters to their hex escape representations, \uxxxx or
    418      * \Uxxxxxxxx.  Unprintable characters are those other than
    419      * U+000A, U+0020..U+007E.
             * @return NOTE(review): presumably a reference to result; confirm
             * in the implementation.
    420      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    421      */
    422     virtual UnicodeString& toRules(UnicodeString& result,
    423                                    UBool escapeUnprintable) const;
    424 
    425 protected:
    426     /**
    427      * Implement Transliterator framework
    428      */
    429     virtual void handleGetSourceSet(UnicodeSet& result) const;
    430 
    431 public:
    432     /**
    433      * Override Transliterator framework
    434      */
    435     virtual UnicodeSet& getTargetSet(UnicodeSet& result) const;
    436 
    437     /**
    438      * Return the class ID for this class.  This is useful only for
    439      * comparing to a return value from getDynamicClassID().  For example:
    440      * <pre>
    441      * .      Base* polymorphic_pointer = createPolymorphicObject();
    442      * .      if (polymorphic_pointer->getDynamicClassID() ==
    443      * .          Derived::getStaticClassID()) ...
    444      * </pre>
    445      * @return          The class ID for all objects of this class.
    446      * @internal Use transliterator factory methods instead since this class will be removed in that release.
    447      */
    448     U_I18N_API static UClassID U_EXPORT2 getStaticClassID(void);
    449 
    450     /**
    451      * Returns a unique class ID <b>polymorphically</b>.  This method
    452      * is to implement a simple version of RTTI, since not all C++
    453      * compilers support genuine RTTI.  Polymorphic operator==() and
    454      * clone() methods call this method.
    455      *
    456      * @return The class ID for this object. All objects of a given
    457      * class have the same class ID.  Objects of other classes have
    458      * different class IDs.
    459      */
    460     virtual UClassID getDynamicClassID(void) const;
    461 
    462 private:
    463 
            /**
             * Private helper whose parameters mirror the rules, direction,
             * parseError and status arguments of the public constructor.
             * NOTE(review): presumably performs the rule parsing on behalf of
             * that constructor; confirm in the implementation.
             */
    464     void _construct(const UnicodeString& rules,
    465                     UTransDirection direction,
    466                     UParseError& parseError,
    467                     UErrorCode& status);
    468 };
    469 
    470 
    471 U_NAMESPACE_END
    472 
    473 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    474 
    475 #endif
    476