Home | History | Annotate | Download | only in unicode
      1 /********************************************************************
      2  * COPYRIGHT:
      3  * Copyright (c) 1997-2010, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  * Copyright (C) 2010 , Yahoo! Inc.
      6  ********************************************************************
      7  *
      8  * File SELFMT.H
      9  *
     10  * Modification History:
     11  *
     12  *   Date        Name        Description
     13  *   11/11/09    kirtig      Finished first cut of implementation.
     14  ********************************************************************/
     15 
     16 #ifndef SELFMT
     17 #define SELFMT
     18 
     19 #include "unicode/utypes.h"
     20 #include "unicode/numfmt.h"
     21 
     22 /**
     23  * \file
     24  * \brief C++ API: SelectFormat object
     25  */
     26 
     27 #if !UCONFIG_NO_FORMATTING
     28 
     29 U_NAMESPACE_BEGIN
     30 
     31 class Hashtable;
     32 
     33 /**
     34   * <p><code>SelectFormat</code> supports the creation of  internationalized
     35   * messages by selecting phrases based on keywords. The pattern  specifies
     36   * how to map keywords to phrases and provides a default phrase. The
     37   * object provided to the format method is a string that's matched
     38   * against the keywords. If there is a match, the corresponding phrase
     39   * is selected; otherwise, the default phrase is used.</p>
     40   *
     41   * <h4>Using <code>SelectFormat</code> for Gender Agreement</h4>
     42   *
     43   * <p>The main use case for the select format is gender based  inflection.
     44   * When names or nouns are inserted into sentences, their gender can  affect pronouns,
     45   * verb forms, articles, and adjectives. Special care needs to be
     46   * taken for the case where the gender cannot be determined.
     47   * The impact varies between languages:</p>
     48   * \htmlonly
     49   * <ul>
     50   * <li>English has three genders, and unknown gender is handled as a  special
     51   * case. Names use the gender of the named person (if known), nouns  referring
     52   * to people use natural gender, and inanimate objects are usually  neutral.
     53   * The gender only affects pronouns: "he", "she", "it", "they".
     54   *
     55   * <li>German differs from English in that the gender of nouns is  rather
     56   * arbitrary, even for nouns referring to people ("M&#x00E4;dchen", girl, is  neutral).
     57   * The gender affects pronouns ("er", "sie", "es"), articles ("der",  "die",
     58   * "das"), and adjective forms ("guter Mann", "gute Frau", "gutes  M&#x00E4;dchen").
     59   *
     60   * <li>French has only two genders; as in German the gender of nouns
     61   * is rather arbitrary - for sun and moon, the genders
     62   * are the opposite of those in German. The gender affects
     63   * pronouns ("il", "elle"), articles ("le", "la"),
     64   * adjective forms ("bon", "bonne"), and sometimes
     65   * verb forms ("all&#x00E9;", "all&#x00E9;e").
     66   *
     67   * <li>Polish distinguishes five genders (or noun classes),
     68   * human masculine, animate non-human masculine, inanimate masculine,
     69   * feminine, and neuter.
     70   * </ul>
     71   * \endhtmlonly
     72   * <p>Some other languages have noun classes that are not related to  gender,
     73   * but similar in grammatical use.
     74   * Some African languages have around 20 noun classes.</p>
     75   *
     76   * <p>To enable localizers to create sentence patterns that take their
     77   * language's gender dependencies into consideration, software has to  provide
     78   * information about the gender associated with a noun or name to
     79   * <code>MessageFormat</code>.
     80   * Two main cases can be distinguished:</p>
     81   *
     82   * <ul>
     83   * <li>For people, natural gender information should be maintained  for each person.
     84   * The keywords "male", "female", "mixed" (for groups of people)
     85   * and "unknown" are used.
     86   *
     87   * <li>For nouns, grammatical gender information should be maintained  for
     88   * each noun and per language, e.g., in resource bundles.
     89   * The keywords "masculine", "feminine", and "neuter" are commonly  used,
     90   * but some languages may require other keywords.
     91   * </ul>
     92   *
     93   * <p>The resulting keyword is provided to <code>MessageFormat</code>  as a
     94   * parameter separate from the name or noun it's associated with. For  example,
     95   * to generate a message such as "Jean went to Paris", three separate  arguments
     96   * would be provided: The name of the person as argument 0, the  gender of
     97   * the person as argument 1, and the name of the city as argument 2.
     98   * The sentence pattern for English, where the gender of the person has
     99   * no impact on this simple sentence, would not refer to argument 1  at all:</p>
    100   *
    101   * <pre>{0} went to {2}.</pre>
    102   *
    103   * <p>The sentence pattern for French, where the gender of the person affects
    104   * the form of the participle, uses a select format based on argument 1:</p>
    105   *
    106   * \htmlonly<pre>{0} est {1, select, female {all&#x00E9;e} other {all&#x00E9;}} &#x00E0; {2}.</pre>\endhtmlonly
    107   *
    108   * <p>Patterns can be nested, so that it's possible to handle  interactions of
    109   * number and gender where necessary. For example, if the above  sentence should
    110   * allow for the names of several people to be inserted, the  following sentence
    111   * pattern can be used (with argument 0 the list of people's names,
    112   * argument 1 the number of people, argument 2 their combined gender, and
    113   * argument 3 the city name):</p>
    114   *
    115   * \htmlonly
    116   * <pre>{0} {1, plural,
    117   *                 one {est {2, select, female {all&#x00E9;e} other  {all&#x00E9;}}}
    118   *                 other {sont {2, select, female {all&#x00E9;es} other {all&#x00E9;s}}}
    119   *          }&#x00E0; {3}.</pre>
    120   * \endhtmlonly
    121   *
    122   * <h4>Patterns and Their Interpretation</h4>
    123   *
    124   * <p>The <code>SelectFormat</code> pattern text defines the phrase  output
    125   * for each user-defined keyword.
    126   * The pattern is a sequence of <code><i>keyword</i>{<i>phrase</i>}</code>
    127   * clauses.
    128   * Each clause assigns the phrase <code><i>phrase</i></code>
    129   * to the user-defined <code><i>keyword</i></code>.</p>
    130   *
    131   * <p>Keywords must match the pattern [a-zA-Z][a-zA-Z0-9_-]*; keywords
    132   * that don't match this pattern result in the error code
    133   * <code>U_ILLEGAL_CHARACTER</code>.
    134   * You always have to define a phrase for the default keyword
    135   * <code>other</code>; this phrase is returned when the keyword
    136   * provided to
    137   * the <code>format</code> method matches no other keyword.
    138   * If a pattern does not provide a phrase for <code>other</code>, the  method
    139   * it's provided to returns the error  <code>U_DEFAULT_KEYWORD_MISSING</code>.
    140   * If a pattern provides more than one phrase for the same keyword, the
    141   * error <code>U_DUPLICATE_KEYWORD</code> is returned.
    142   * <br>
    143   * Spaces between <code><i>keyword</i></code> and
    144   * <code>{<i>phrase</i>}</code>  will be ignored; spaces within
    145   * <code>{<i>phrase</i>}</code> will be preserved.</p>
    146   *
    147   * <p>The phrase for a particular select case may contain other message
    148   * format patterns. <code>SelectFormat</code> preserves these so that  you
    149   * can use the strings produced by <code>SelectFormat</code> with other
    150   * formatters. If you are using <code>SelectFormat</code> inside a
    151   * <code>MessageFormat</code> pattern, <code>MessageFormat</code> will
    152   * automatically evaluate the resulting format pattern.
    153   * Thus, curly braces (<code>{</code>, <code>}</code>) are <i>only</i> allowed
    154   * in phrases to define a nested format pattern.</p>
    155   *
    156   * <p>Example:
    157   * \htmlonly
    158   *
    159   * UErrorCode status = U_ZERO_ERROR;
    160   * MessageFormat *msgFmt = new MessageFormat(UnicodeString("{0} est  {1, select, female {all&#x00E9;e} other {all&#x00E9;}} &#x00E0; Paris."), Locale("fr"),  status);
    161   * if (U_FAILURE(status)) {
    162   *       return;
    163   * }
    164   * FieldPosition ignore(FieldPosition::DONT_CARE);
    165   * UnicodeString result;
    166   *
    167   * const char* str1 = "Kirti,female";
    168   * Formattable args1[] = {"Kirti","female"};
    169   * msgFmt->format(args1, 2, result, ignore, status);
    170   * cout << "Input is " << str1 << " and result is: " << result << endl;
    171   * delete msgFmt;
    172   *
    173   * \endhtmlonly
    174   * </p>
    175   *
    176   * Produces the output:<br>
    177   * \htmlonly
    178   * <code>Kirti est all&#x00E9;e &#x00E0; Paris.</code>
    179   * \endhtmlonly
    180   *
    181   * @stable ICU 4.4
    182   */
    183 
    184 class U_I18N_API SelectFormat : public Format {
    185 public:
    186 
    187     /**
    188      * Creates a new <code>SelectFormat</code> for a given pattern string.
    189      * @param  pattern the pattern for this <code>SelectFormat</code>.
    190      *                 Errors are returned to status if the pattern is invalid.
    191      * @param status   output param set to success/failure code on exit, which
    192      *                 must not indicate a failure before the function call.
    193      * @stable ICU 4.4
    194      */
    195     SelectFormat(const UnicodeString& pattern, UErrorCode& status);
    196 
    197     /**
    198      * Copy constructor; constructs this object from <code>other</code>.
    199      * @stable ICU 4.4
    200      */
    201     SelectFormat(const SelectFormat& other);
    202 
    203     /**
    204      * Destructor.
    205      * @stable ICU 4.4
    206      */
    207     virtual ~SelectFormat();
    208 
    209     /**
    210      * Sets the pattern used by this select format, i.e. the sequence of
    211      * <code><i>keyword</i>{<i>phrase</i>}</code> clauses to select from.
    212      * Patterns and their interpretation are specified in the class description.
    213      *
    214      * @param pattern the pattern for this select format.
    215      *                Errors are returned to status if the pattern is invalid.
    216      * @param status  output param set to success/failure code on exit, which
    217      *                must not indicate a failure before the function call.
    218      * @stable ICU 4.4
    219      */
    220     void applyPattern(const UnicodeString& pattern, UErrorCode& status);
    221 
    222 
    223     using Format::format;  // keeps the format() overloads inherited from Format visible; the overloads declared below would otherwise hide them
    224 
    225     /**
    226      * Selects the phrase for the given keyword.
    227      *
    228      * @param keyword  The keyword that is used to select an alternative.
    229      * @param appendTo output parameter to receive result.
    230      *                 Result is appended to existing contents.
    231      * @param pos      On input: an alignment field, if desired.
    232      *                 On output: the offsets of the alignment field.
    233      * @param status   output param set to success/failure code on exit, which
    234      *                 must not indicate a failure before the function call.
    235      * @return         Reference to 'appendTo' parameter.
    236      * @stable ICU 4.4
    237      */
    238     UnicodeString& format(const UnicodeString& keyword,
    239                             UnicodeString& appendTo,
    240                             FieldPosition& pos,
    241                             UErrorCode& status) const;
    242 
    243     /**
    244      * Assignment operator.
    245      *
    246      * @param other    the SelectFormat object to copy from.
    247      * @stable ICU 4.4
    248      */
    249     SelectFormat& operator=(const SelectFormat& other);
    250 
    251     /**
    252      * Returns true if another object is semantically equal to this one.
    253      *
    254      * @param other    the Format object to be compared with.
    255      * @return         true if other is semantically equal to this.
    256      * @stable ICU 4.4
    257      */
    258     virtual UBool operator==(const Format& other) const;
    259 
    260     /**
    261      * Returns true if another object is semantically unequal to this one.
    262      *
    263      * @param other    the Format object to be compared with.
    264      * @return         true if other is semantically unequal to this.
    265      * @stable ICU 4.4
    266      */
    267     virtual UBool operator!=(const Format& other) const;
    268 
    269     /**
    270      * Clones this Format object polymorphically.
    271      * @return a copy of this object; the caller owns the result and should delete it when done.
    272      * @stable ICU 4.4
    273      */
    274     virtual Format* clone(void) const;
    275 
    276     /**
    277      * Formats an object to produce a string.
    278      * This method handles keyword strings.
    279      * If the Formattable object is not a <code>UnicodeString</code>,
    280      * then it returns a failing UErrorCode.
    281      *
    282      * @param obj       A keyword string that is used to select an alternative.
    283      * @param appendTo  output parameter to receive result.
    284      *                  Result is appended to existing contents.
    285      * @param pos       On input: an alignment field, if desired.
    286      *                  On output: the offsets of the alignment field.
    287      * @param status    output param filled with success/failure status.
    288      * @return          Reference to 'appendTo' parameter.
    289      * @stable ICU 4.4
    290      */
    291     UnicodeString& format(const Formattable& obj,
    292                          UnicodeString& appendTo,
    293                          FieldPosition& pos,
    294                          UErrorCode& status) const;
    295 
    296     /**
    297      * Returns the pattern from applyPattern() or constructor.
    298      *
    299      * @param  appendTo  output parameter to receive result.
    300      *                   Result is appended to existing contents.
    301      * @return the UnicodeString with inserted pattern.
    302      * @stable ICU 4.4
    303      */
    304     UnicodeString& toPattern(UnicodeString& appendTo);
    305 
    306     /**
    307      * This method is not yet supported by <code>SelectFormat</code>.
    308      * <P>
    309      * Before calling, set parse_pos.index to the offset you want to start
    310      * parsing at in the source. After calling, parse_pos.index is the end of
    311      * the text you parsed. If an error occurs, index is unchanged.
    312      * <P>
    313      * When parsing, leading whitespace is discarded (with a successful parse),
    314      * while trailing whitespace is left as is.
    315      * <P>
    316      * See Format::parseObject() for more.
    317      *
    318      * @param source     The string to be parsed into an object.
    319      * @param result     Formattable to be set to the parse result.
    320      *     If parse fails, return contents are undefined.
    321      * @param parse_pos The position to start parsing at. Upon return
    322      *     this param is set to the position after the
    323      *     last character successfully parsed. If the
    324      *     source is not parsed successfully, this param
    325      *     will remain unchanged.
    326      * @stable ICU 4.4
    327      */
    328     virtual void parseObject(const UnicodeString& source,
    329                             Formattable& result,
    330                             ParsePosition& parse_pos) const;
    331 
    332     /**
    333      * ICU "poor man's RTTI", returns a UClassID for this class.
    334      * @stable ICU 4.4
    335      */
    336     static UClassID U_EXPORT2 getStaticClassID(void);
    337 
    338     /**
    339      * ICU "poor man's RTTI", returns a UClassID for the actual class.
    340      * @stable ICU 4.4
    341      */
    342     virtual UClassID getDynamicClassID() const;
    343 
    344 private:
    345     typedef enum classesForSelectFormat{  // character classes returned by classifyCharacter()
    346         tStartKeyword,
    347         tContinueKeyword,
    348         tLeftBrace,
    349         tRightBrace,
    350         tSpace,
    351         tOther
    352     }CharacterClass;
    353 
    354     UnicodeString pattern;  // NOTE(review): presumably the pattern text returned by toPattern() -- confirm in the implementation
    355     // Hash table storing the (keyword, phrase) pairs.
    356     Hashtable  *parsedValuesHash;
    357 
    358     SelectFormat();   // default constructor not implemented.
    359     void initHashTable(UErrorCode &status);  // hash table helpers; their definitions are not in this header
    360     void cleanHashTable();
    361 
    362     // Used by applyPattern(): classifies a character into one of the CharacterClass values.
    363     CharacterClass classifyCharacter(UChar ch) const;
    364     // Checks if the "other" keyword is present in the pattern.
    365     UBool checkSufficientDefinition();
    366     // Checks if the keyword passed is valid.
    367     UBool checkValidKeyword(const UnicodeString& argKeyword) const;
    368     void parsingFailure();
    369     void copyHashtable(Hashtable *other, UErrorCode& status);
    370 };
    371 
    372 U_NAMESPACE_END
    373 
    374 #endif /* #if !UCONFIG_NO_FORMATTING */
    375 
    376 #endif // SELFMT
    377 //eof
    378