Home | History | Annotate | Download | only in i18n
      1 /*
      2  **************************************************************************
      3  *   Copyright (c) 2002-2010, International Business Machines Corporation *
      4  *   and others.  All Rights Reserved.                                    *
      5  **************************************************************************
      6  *   Date        Name        Description                                  *
      7  *   01/28/2002  aliu        Creation.                                    *
      8  **************************************************************************
      9  */
     10 #ifndef TRIDPARS_H
     11 #define TRIDPARS_H
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_TRANSLITERATION
     16 
     17 #include "unicode/uobject.h"
     18 #include "unicode/unistr.h"
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 class Transliterator;
     23 class UnicodeSet;
     24 class UVector;
     25 
     26 /**
     27  * Parsing component for transliterator IDs.  This class contains only
     28  * static members; it cannot be instantiated.  Methods in this class
     29  * parse various ID formats, including the following:
     30  *
     31  * A basic ID, which contains source, target, and variant, but no
     32  * filter and no explicit inverse.  Examples include
     33  * "Latin-Greek/UNGEGN" and "Null".
     34  *
     35  * A single ID, which is a basic ID plus optional filter and optional
     36  * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
     37  * "Lower (Upper)".
     38  *
     39  * A compound ID, which is a sequence of one or more single IDs,
     40  * separated by semicolons, with optional forward and reverse global
     41  * filters.  The global filters are UnicodeSet patterns prepended or
     42  * appended to the IDs, separated by semicolons.  An appended filter
     43  * must be enclosed in parentheses and applies in the reverse
     44  * direction.
     45  *
     46  * @author Alan Liu
     47  */
     48 class TransliteratorIDParser /* not : public UObject because all methods are static */ {
     49 
     50  public:
     51 
     52     /**
     53      * A structure containing the parsed data of a filtered ID, that
     54      * is, a basic ID optionally with a filter.
     55      *
     56      * 'source' and 'target' will always be non-null.  The 'variant'
     57      * will be non-null only if a non-empty variant was parsed.
     58      *
     59      * 'sawSource' is true if there was an explicit source in the
     60      * parsed id.  If there was no explicit source, then an implied
     61      * source of ANY is returned and 'sawSource' is set to false.
     62      *
     63      * 'filter' is the parsed filter pattern, or null if there was no
     64      * filter.
     65      */
     66     class Specs : public UMemory {
     67     public:
     68         UnicodeString source; // not null
     69         UnicodeString target; // not null
     70         UnicodeString variant; // may be null
     71         UnicodeString filter; // may be null
     72         UBool sawSource; // true if the parsed id contained an explicit source
     73         Specs(const UnicodeString& s, const UnicodeString& t,
     74               const UnicodeString& v, UBool sawS,
     75               const UnicodeString& f);
     76 
     77     private:
     78 
     79         Specs(const Specs &other); // forbid copying of this class
     80         Specs &operator=(const Specs &other); // forbid copying of this class
     81     };
     82 
     83     /**
     84      * A structure containing the canonicalized data of a filtered ID,
     85      * that is, a basic ID optionally with a filter.
     86      *
     87      * 'canonID' is always non-null.  It may be the empty string "".
     88      * It is the id that should be assigned to the created
     89      * transliterator.  It _cannot_ be instantiated directly.
     90      *
     91      * 'basicID' is always non-null and non-empty.  It is always of
     92      * the form S-T or S-T/V.  It is designed to be fed to low-level
     93      * instantiation code that only understands these two formats.
     94      *
     95      * 'filter' may be null, if there is none, or non-null and
     96      * non-empty.
     97      */
     98     class SingleID : public UMemory {
     99     public:
    100         UnicodeString canonID; // id to assign to the created transliterator; may be ""
    101         UnicodeString basicID; // always of the form S-T or S-T/V
    102         UnicodeString filter;  // filter pattern, if there is one
    103         SingleID(const UnicodeString& c, const UnicodeString& b,
    104                  const UnicodeString& f);
    105         SingleID(const UnicodeString& c, const UnicodeString& b);
    106         Transliterator* createInstance();
    107 
    108     private:
    109 
    110         SingleID(const SingleID &other); // forbid copying of this class
    111         SingleID &operator=(const SingleID &other); // forbid copying of this class
    112     };
    113 
    114     /**
    115      * Parse a filter ID, that is, an ID of the general form
    116      * "[f1] s1-t1/v1", with the filters optional, and the variants optional.
    117      * @param id the id to be parsed
    118      * @param pos INPUT-OUTPUT parameter.  On input, the position of
    119      * the first character to parse.  On output, the position after
    120      * the last character parsed.
    121      * @return a SingleID object or null if the parse fails
    122      */
    123     static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);
    124 
    125     /**
    126      * Parse a single ID, that is, an ID of the general form
    127      * "[f1] s1-t1/v1 ([f2] s2-t2/v2)", with the parenthesized element
    128      * optional, the filters optional, and the variants optional.
    129      * @param id the id to be parsed
    130      * @param pos INPUT-OUTPUT parameter.  On input, the position of
    131      * the first character to parse.  On output, the position after
    132      * the last character parsed.
    133      * @param dir the direction; REVERSE constructs the SingleID for the reverse direction.
    134      * @param status UErrorCode reference, presumably used to report errors (details not visible in this header).
    135      * @return a SingleID object or null
    136      */
    137     static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
    138                                   int32_t dir, UErrorCode& status);
    139 
    140     /**
    141      * Parse a global filter of the form "[f]" or "([f])", depending
    142      * on 'withParens'.
    143      * @param id the pattern to parse
    144      * @param pos INPUT-OUTPUT parameter.  On input, the position of
    145      * the first character to parse.  On output, the position after
    146      * the last character parsed.
    147      * @param dir the direction.
    148      * @param withParens INPUT-OUTPUT parameter.  On entry, if
    149      * withParens is 0, then parens are disallowed.  If it is 1,
    150      * then parens are required.  If it is -1, then parens are
    151      * optional, and withParens will be set to 0 or 1 on return.
    152      * @param canonID OUTPUT parameter.  The pattern for the filter
    153      * added to the canonID, either at the end, if dir is FORWARD, or
    154      * at the start, if dir is REVERSE.  The pattern will be enclosed
    155      * in parentheses if appropriate, and will be suffixed with an
    156      * ID_DELIM character.  May be null.
    157      * @return a UnicodeSet object or null.  A non-null result
    158      * indicates a successful parse, regardless of whether the filter
    159      * applies to the given direction.  The caller should discard it
    160      * if withParens != (dir == REVERSE).
    161      */
    162     static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
    163                                          int32_t dir,
    164                                          int32_t& withParens,
    165                                          UnicodeString* canonID);
    166 
    167     /**
    168      * Parse a compound ID, consisting of an optional forward global
    169      * filter, a separator, one or more single IDs delimited by
    170      * separators, and an optional reverse global filter.  The
    171      * separator is a semicolon.  The global filters are UnicodeSet
    172      * patterns.  The reverse global filter must be enclosed in
    173      * parentheses.
    174      * @param id the pattern to parse
    175      * @param dir the direction.
    176      * @param canonID OUTPUT parameter that receives the canonical ID,
    177      * consisting of canonical IDs for all elements, as returned by
    178      * parseSingleID(), separated by semicolons.  Previous contents
    179      * are discarded.
    180      * @param list OUTPUT parameter that receives a list of SingleID
    181      * objects representing the parsed IDs.  Previous contents are
    182      * discarded.
    183      * @param globalFilter OUTPUT parameter that receives a pointer to
    184      * a newly created global filter for this ID in this direction, or
    185      * null if there is none.
    186      * @return true if the parse succeeds, that is, if the entire
    187      * id is consumed without syntax error.
    188      */
    189     static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
    190                                  UnicodeString& canonID,
    191                                  UVector& list,
    192                                  UnicodeSet*& globalFilter);
    193 
    194     /**
    195      * Convert the elements of the 'list' vector, which are SingleID
    196      * objects, into actual Transliterator objects.  In the course of
    197      * this, some (or all) entries may be removed.  If all entries
    198      * are removed, the Null transliterator will be added.
    199      *
    200      * Delete entries with empty basicIDs; these are generated by
    201      * elements like "(A)" in the forward direction, or "A()" in
    202      * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
    203      * SingleID entries to actual transliterators.
    204      *
    205      * @param list vector of SingleID objects.  On exit, vector
    206      * of one or more Transliterators.
    207      * @param ec Output param to receive a success or an error code.
    208      * (No return value: the function is declared void.  An earlier
    209      * description of a returned "insertIndex" does not match this
    210      * signature.)
    211      */
    212     static void instantiateList(UVector& list,
    213                                 UErrorCode& ec);
    214 
    215     /**
    216      * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
    217      * S-T/V, or S/V-T.  If the source is missing, return a source of
    218      * ANY.
    219      * @param id the id string, in any of several forms
    220      * @param source          OUTPUT parameter: the parsed source.
    221      * @param target          OUTPUT parameter: the parsed target.
    222      * @param variant         OUTPUT parameter: the parsed variant.
    223      * @param isSourcePresent OUTPUT parameter: TRUE if the source is
    224      *                        present in the id.  See the notes below for
    225      *                        the case where the source is not present.
    226      * Results are delivered through the four reference parameters; the
    227      * function itself is declared void.  If the source is not present,
    228      * ANY is given as the source and isSourcePresent is presumably FALSE
    229      * (the original text said "null").  The target may be empty if the
    230      * id is not well-formed.  The variant may be empty.
    231      */
    232     static void IDtoSTV(const UnicodeString& id,
    233                         UnicodeString& source,
    234                         UnicodeString& target,
    235                         UnicodeString& variant,
    236                         UBool& isSourcePresent);
    237 
    238     /**
    239      * Given source, target, and variant strings, concatenate them into a
    240      * full ID.  If the source is empty, then "Any" will be used for the
    241      * source, so the ID will always be of the form s-t/v or s-t.
    242      */
    243     static void STVtoID(const UnicodeString& source,
    244                         const UnicodeString& target,
    245                         const UnicodeString& variant,
    246                         UnicodeString& id);
    247 
    248     /**
    249      * Register two targets as being inverses of one another.  For
    250      * example, calling registerSpecialInverse("NFC", "NFD", true, status) causes
    251      * Transliterator to form the following inverse relationships:
    252      *
    253      * <pre>NFC => NFD
    254      * Any-NFC => Any-NFD
    255      * NFD => NFC
    256      * Any-NFD => Any-NFC</pre>
    257      *
    258      * (Without the special inverse registration, the inverse of NFC
    259      * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
    260      * that the presence or absence of "Any-" is preserved.
    261      *
    262      * <p>The relationship is symmetrical; registering (a, b) is
    263      * equivalent to registering (b, a).
    264      *
    265      * <p>The relevant IDs must still be registered separately as
    266      * factories or classes.
    267      *
    268      * <p>Only the targets are specified.  Special inverses always
    269      * have the form Any-Target1 <=> Any-Target2.  The target should
    270      * have canonical casing (the casing desired to be produced when
    271      * an inverse is formed) and should contain no whitespace or other
    272      * extraneous characters.
    273      *
    274      * @param target the target against which to register the inverse
    275      * @param inverseTarget the inverse of target, that is
    276      * Any-target.getInverse() => Any-inverseTarget
    277      * @param bidirectional if true, also register Any-inverseTarget.getInverse() => Any-target
    278      * @param status UErrorCode reference, presumably used to report errors (details not visible in this header).
    279      */
    280     static void registerSpecialInverse(const UnicodeString& target,
    281                                        const UnicodeString& inverseTarget,
    282                                        UBool bidirectional,
    283                                        UErrorCode &status);
    284 
    285     /**
    286      * Free static memory.
    287      */
    288     static void cleanup();
    289 
    290  private:
    291     //----------------------------------------------------------------
    292     // Private implementation
    293     //----------------------------------------------------------------
    294 
    295     // forbid instantiation
    296     TransliteratorIDParser();
    297 
    298     /**
    299      * Parse an ID into component pieces.  Take IDs of the form T,
    300      * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
    301      * source of ANY.
    302      * @param id the id string, in any of several forms
    303      * @param pos INPUT-OUTPUT parameter.  On input, pos is the
    304      * offset of the first character to parse in id.  On output,
    305      * pos is the offset after the last parsed character.  If the
    306      * parse failed, pos will be unchanged.
    307      * @param allowFilter if true, a UnicodeSet pattern is allowed
    308      * at any location between specs or delimiters, and is returned
    309      * in the Specs object.
    310      * @return a Specs object, or null if the parse failed.  If
    311      * neither source nor target was seen in the parsed id, then the
    312      * parse fails.  If allowFilter is true, then the parsed filter
    313      * pattern is returned in the Specs object, otherwise the returned
    314      * filter reference is null.  If the parse fails for any reason
    315      * null is returned.
    316      */
    317     static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
    318                                 UBool allowFilter);
    319 
    320     /**
    321      * Given a Specs object, convert it to a SingleID object.  The
    322      * Specs object is a more unprocessed parse result.  The SingleID
    323      * object contains information about canonical and basic IDs.
    324      * @param specs the given Specs object.
    325      * @param dir   either FORWARD or REVERSE.
    326      * @return a SingleID; never returns null.  Returned object always
    327      * has 'filter' field of null.
    328      */
    329     static SingleID* specsToID(const Specs* specs, int32_t dir);
    330 
    331     /**
    332      * Given a Specs object, return a SingleID representing the special
    333      * inverse of that ID.  If there is no special inverse then return null.
    334      * @param specs the given Specs.
    335      * @param status UErrorCode reference, presumably used to report errors (details not visible in this header).
    336      * @return a SingleID or null.  Returned object always has
    337      * 'filter' field of null.
    338      */
    339     static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);
    340 
    341     /**
    342      * Glue method to get around access problems in C++.
    343      * @param id the id string for the transliterator, in any of several forms
    344      * @param canonID the given canonical ID
    345      */
    346     static Transliterator* createBasicInstance(const UnicodeString& id,
    347                                                const UnicodeString* canonID);
    348 
    349     /**
    350      * Initialize static memory.
    351      */
    352     static void init(UErrorCode &status);
    353 
    354     friend class SingleID;
    355 };
    356 
    357 U_NAMESPACE_END
    358 
    359 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    360 
    361 #endif
    362