Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /**
      4  ************************************************************************************
      5  * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
      6  * All Rights Reserved.                                                             *
      7  ************************************************************************************
      8  */
      9 
     10 #ifndef BRKENG_H
     11 #define BRKENG_H
     12 
     13 #include "unicode/utypes.h"
     14 #include "unicode/uobject.h"
     15 #include "unicode/utext.h"
     16 #include "unicode/uscript.h"
     17 
     18 U_NAMESPACE_BEGIN
     19 
     20 class UnicodeSet;
     21 class UStack;
     22 class UVector32;
     23 class DictionaryMatcher;
     24 
     25 /*******************************************************************
     26  * LanguageBreakEngine
     27  */
     28 
     29 /**
     30  * <p>LanguageBreakEngines implement language-specific knowledge for
     31  * finding text boundaries within a run of characters belonging to a
     32  * specific set. The boundaries will be of a specific kind, e.g. word,
     33  * line, etc.</p>
     34  *
     35  * <p>LanguageBreakEngines should normally be implemented so as to
     36  * be shared between threads without locking.</p>
     37  */
     38 class LanguageBreakEngine : public UMemory {
     39  public:
     40 
     41   /**
     42    * <p>Default constructor.</p>
     43    *
     44    */
     45   LanguageBreakEngine();
     46 
     47   /**
     48    * <p>Virtual destructor.</p>
     49    */
     50   virtual ~LanguageBreakEngine();
     51 
     52  /**
     53   * <p>Indicate whether this engine handles a particular character for
     54   * a particular kind of break.</p>
     55   *
     56   * @param c A character which begins a run that the engine might handle
     57   * @return TRUE if this engine handles the particular character and break
     58   * type.
     59   */
     60   virtual UBool handles(UChar32 c) const = 0;
     61 
     62  /**
     63   * <p>Find any breaks within a run in the supplied text.</p>
     64   *
     65   * @param text A UText representing the text. The
     66   * iterator is left at the end of the run of characters which the engine
     67   * is capable of handling.
     68   * @param startPos The start of the run within the supplied text.
     69   * @param endPos The end of the run within the supplied text.
     70   * @param foundBreaks A Vector of int32_t to receive the breaks.
     71   * @return The number of breaks found.
     72   */
     73   virtual int32_t findBreaks( UText *text,
     74                               int32_t startPos,
     75                               int32_t endPos,
     76                               UVector32 &foundBreaks ) const = 0;
     77 
     78 };
     79 
     80 /*******************************************************************
     81  * LanguageBreakFactory
     82  */
     83 
     84 /**
     85  * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
     86  * that can determine breaks for characters in a specific set, if
     87  * such an object can be found.</p>
     88  *
     89  * <p>If a LanguageBreakFactory is to be shared between threads,
     90  * appropriate synchronization must be used; there is none internal
     91  * to the factory.</p>
     92  *
     93  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
     94  * normally be shared between threads without synchronization, unless
     95  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
     96  *
     97  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
     98  * it returns when it itself is deleted, unless the specific subclass of
     99  * LanguageBreakFactory indicates otherwise. Naturally, the factory should
    100  * not be deleted until the LanguageBreakEngines it has returned are no
    101  * longer needed.</p>
    102  */
    103 class LanguageBreakFactory : public UMemory {
    104  public:
    105 
    106   /**
    107    * <p>Default constructor.</p>
    108    *
    109    */
    110   LanguageBreakFactory();
    111 
    112   /**
    113    * <p>Virtual destructor.</p>
    114    */
    115   virtual ~LanguageBreakFactory();
    116 
    117  /**
    118   * <p>Find and return a LanguageBreakEngine that can find the desired
    119   * kind of break for the set of characters to which the supplied
    120   * character belongs. It is up to the set of available engines to
    121   * determine what the sets of characters are.</p>
    122   *
    123   * @param c A character that begins a run for which a LanguageBreakEngine is
    124   * sought.
    125   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    126   */
    127   virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
    128 
    129 };
    130 
    131 /*******************************************************************
    132  * UnhandledEngine
    133  */
    134 
    135 /**
    136  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
    137  * handles characters that no other LanguageBreakEngine is available to
    138  * handle. It is told the character and the type of break; at its
    139  * discretion it may handle more than the specified character (e.g.,
    140  * the entire script to which that character belongs.</p>
    141  *
    142  * <p>UnhandledEngines may not be shared between threads without
    143  * external synchronization.</p>
    144  */
    145 
    146 class UnhandledEngine : public LanguageBreakEngine {
    147  private:
    148 
    149     /**
    150      * The sets of characters handled.
    151      * @internal
    152      */
    153 
    154   UnicodeSet    *fHandled;
    155 
    156  public:
    157 
    158   /**
    159    * <p>Default constructor.</p>
    160    *
    161    */
    162   UnhandledEngine(UErrorCode &status);
    163 
    164   /**
    165    * <p>Virtual destructor.</p>
    166    */
    167   virtual ~UnhandledEngine();
    168 
    169  /**
    170   * <p>Indicate whether this engine handles a particular character for
    171   * a particular kind of break.</p>
    172   *
    173   * @param c A character which begins a run that the engine might handle
    174   * @return TRUE if this engine handles the particular character and break
    175   * type.
    176   */
    177   virtual UBool handles(UChar32 c) const;
    178 
    179  /**
    180   * <p>Find any breaks within a run in the supplied text.</p>
    181   *
    182   * @param text A UText representing the text (TODO: UText). The
    183   * iterator is left at the end of the run of characters which the engine
    184   * is capable of handling.
    185   * @param startPos The start of the run within the supplied text.
    186   * @param endPos The end of the run within the supplied text.
    187   * @param foundBreaks An allocated C array of the breaks found, if any
    188   * @return The number of breaks found.
    189   */
    190   virtual int32_t findBreaks( UText *text,
    191                               int32_t startPos,
    192                               int32_t endPos,
    193                               UVector32 &foundBreaks ) const;
    194 
    195  /**
    196   * <p>Tell the engine to handle a particular character and break type.</p>
    197   *
    198   * @param c A character which the engine should handle
    199   */
    200   virtual void handleCharacter(UChar32 c);
    201 
    202 };
    203 
    204 /*******************************************************************
    205  * ICULanguageBreakFactory
    206  */
    207 
    208 /**
    209  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
    210  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
    211  * data in the ICU data file.</p>
    212  */
    213 class ICULanguageBreakFactory : public LanguageBreakFactory {
    214  private:
    215 
    216     /**
    217      * The stack of break engines created by this factory
    218      * @internal
    219      */
    220 
    221   UStack    *fEngines;
    222 
    223  public:
    224 
    225   /**
    226    * <p>Standard constructor.</p>
    227    *
    228    */
    229   ICULanguageBreakFactory(UErrorCode &status);
    230 
    231   /**
    232    * <p>Virtual destructor.</p>
    233    */
    234   virtual ~ICULanguageBreakFactory();
    235 
    236  /**
    237   * <p>Find and return a LanguageBreakEngine that can find the desired
    238   * kind of break for the set of characters to which the supplied
    239   * character belongs. It is up to the set of available engines to
    240   * determine what the sets of characters are.</p>
    241   *
    242   * @param c A character that begins a run for which a LanguageBreakEngine is
    243   * sought.
    244   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    245   */
    246   virtual const LanguageBreakEngine *getEngineFor(UChar32 c);
    247 
    248 protected:
    249  /**
    250   * <p>Create a LanguageBreakEngine for the set of characters to which
    251   * the supplied character belongs, for the specified break type.</p>
    252   *
    253   * @param c A character that begins a run for which a LanguageBreakEngine is
    254   * sought.
    255   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    256   */
    257   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
    258 
    259   /**
    260    * <p>Create a DictionaryMatcher for the specified script and break type.</p>
    261    * @param script An ISO 15924 script code that identifies the dictionary to be
    262    * created.
    263    * @return A DictionaryMatcher with the desired characteristics, or NULL.
    264    */
    265   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
    266 };
    267 
    268 U_NAMESPACE_END
    269 
    270     /* BRKENG_H */
    271 #endif
    272