Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /**
      4  ************************************************************************************
      5  * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
      6  * All Rights Reserved.                                                             *
      7  ************************************************************************************
      8  */
      9 
     10 #ifndef BRKENG_H
     11 #define BRKENG_H
     12 
     13 #include "unicode/utypes.h"
     14 #include "unicode/uobject.h"
     15 #include "unicode/utext.h"
     16 #include "unicode/uscript.h"
     17 
     18 U_NAMESPACE_BEGIN
     19 
     20 class UnicodeSet;
     21 class UStack;
     22 class DictionaryMatcher;
     23 
     24 /*******************************************************************
     25  * LanguageBreakEngine
     26  */
     27 
     28 /**
     29  * <p>LanguageBreakEngines implement language-specific knowledge for
     30  * finding text boundaries within a run of characters belonging to a
     31  * specific set. The boundaries will be of a specific kind, e.g. word,
     32  * line, etc.</p>
     33  *
     34  * <p>LanguageBreakEngines should normally be implemented so as to
     35  * be shared between threads without locking.</p>
     36  */
     37 class LanguageBreakEngine : public UMemory {
     38  public:
     39 
     40   /**
     41    * <p>Default constructor.</p>
     42    *
     43    */
     44   LanguageBreakEngine();
     45 
     46   /**
     47    * <p>Virtual destructor.</p>
     48    */
     49   virtual ~LanguageBreakEngine();
     50 
     51  /**
     52   * <p>Indicate whether this engine handles a particular character for
     53   * a particular kind of break.</p>
     54   *
     55   * @param c A character which begins a run that the engine might handle
     56   * @param breakType The type of text break which the caller wants to determine
     57   * @return TRUE if this engine handles the particular character and break
     58   * type.
     59   */
     60   virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
     61 
     62  /**
     63   * <p>Find any breaks within a run in the supplied text.</p>
     64   *
     65   * @param text A UText representing the text. The
     66   * iterator is left at the end of the run of characters which the engine
     67   * is capable of handling.
     68   * @param startPos The start of the run within the supplied text.
     69   * @param endPos The end of the run within the supplied text.
     70   * @param reverse Whether the caller is looking for breaks in a reverse
     71   * direction.
     72   * @param breakType The type of break desired, or -1.
     73   * @param foundBreaks An allocated C array of the breaks found, if any
     74   * @return The number of breaks found.
     75   */
     76   virtual int32_t findBreaks( UText *text,
     77                               int32_t startPos,
     78                               int32_t endPos,
     79                               UBool reverse,
     80                               int32_t breakType,
     81                               UStack &foundBreaks ) const = 0;
     82 
     83 };
     84 
     85 /*******************************************************************
     86  * LanguageBreakFactory
     87  */
     88 
     89 /**
     90  * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
     91  * that can determine breaks for characters in a specific set, if
     92  * such an object can be found.</p>
     93  *
     94  * <p>If a LanguageBreakFactory is to be shared between threads,
     95  * appropriate synchronization must be used; there is none internal
     96  * to the factory.</p>
     97  *
     98  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
     99  * normally be shared between threads without synchronization, unless
    100  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
    101  *
    102  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
    103  * it returns when it itself is deleted, unless the specific subclass of
    104  * LanguageBreakFactory indicates otherwise. Naturally, the factory should
    105  * not be deleted until the LanguageBreakEngines it has returned are no
    106  * longer needed.</p>
    107  */
    108 class LanguageBreakFactory : public UMemory {
    109  public:
    110 
    111   /**
    112    * <p>Default constructor.</p>
    113    *
    114    */
    115   LanguageBreakFactory();
    116 
    117   /**
    118    * <p>Virtual destructor.</p>
    119    */
    120   virtual ~LanguageBreakFactory();
    121 
    122  /**
    123   * <p>Find and return a LanguageBreakEngine that can find the desired
    124   * kind of break for the set of characters to which the supplied
    125   * character belongs. It is up to the set of available engines to
    126   * determine what the sets of characters are.</p>
    127   *
    128   * @param c A character that begins a run for which a LanguageBreakEngine is
    129   * sought.
    130   * @param breakType The kind of text break for which a LanguageBreakEngine is
    131   * sought.
    132   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    133   */
    134   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
    135 
    136 };
    137 
    138 /*******************************************************************
    139  * UnhandledEngine
    140  */
    141 
    142 /**
    143  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
    144  * handles characters that no other LanguageBreakEngine is available to
    145  * handle. It is told the character and the type of break; at its
    146  * discretion it may handle more than the specified character (e.g.,
    147  * the entire script to which that character belongs.</p>
    148  *
    149  * <p>UnhandledEngines may not be shared between threads without
    150  * external synchronization.</p>
    151  */
    152 
    153 class UnhandledEngine : public LanguageBreakEngine {
    154  private:
    155 
    156     /**
    157      * The sets of characters handled, for each break type
    158      * @internal
    159      */
    160 
    161   UnicodeSet    *fHandled[4];
    162 
    163  public:
    164 
    165   /**
    166    * <p>Default constructor.</p>
    167    *
    168    */
    169   UnhandledEngine(UErrorCode &status);
    170 
    171   /**
    172    * <p>Virtual destructor.</p>
    173    */
    174   virtual ~UnhandledEngine();
    175 
    176  /**
    177   * <p>Indicate whether this engine handles a particular character for
    178   * a particular kind of break.</p>
    179   *
    180   * @param c A character which begins a run that the engine might handle
    181   * @param breakType The type of text break which the caller wants to determine
    182   * @return TRUE if this engine handles the particular character and break
    183   * type.
    184   */
    185   virtual UBool handles(UChar32 c, int32_t breakType) const;
    186 
    187  /**
    188   * <p>Find any breaks within a run in the supplied text.</p>
    189   *
    190   * @param text A UText representing the text (TODO: UText). The
    191   * iterator is left at the end of the run of characters which the engine
    192   * is capable of handling.
    193   * @param startPos The start of the run within the supplied text.
    194   * @param endPos The end of the run within the supplied text.
    195   * @param reverse Whether the caller is looking for breaks in a reverse
    196   * direction.
    197   * @param breakType The type of break desired, or -1.
    198   * @param foundBreaks An allocated C array of the breaks found, if any
    199   * @return The number of breaks found.
    200   */
    201   virtual int32_t findBreaks( UText *text,
    202                               int32_t startPos,
    203                               int32_t endPos,
    204                               UBool reverse,
    205                               int32_t breakType,
    206                               UStack &foundBreaks ) const;
    207 
    208  /**
    209   * <p>Tell the engine to handle a particular character and break type.</p>
    210   *
    211   * @param c A character which the engine should handle
    212   * @param breakType The type of text break for which the engine should handle c
    213   */
    214   virtual void handleCharacter(UChar32 c, int32_t breakType);
    215 
    216 };
    217 
    218 /*******************************************************************
    219  * ICULanguageBreakFactory
    220  */
    221 
    222 /**
    223  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
    224  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
    225  * data in the ICU data file.</p>
    226  */
    227 class ICULanguageBreakFactory : public LanguageBreakFactory {
    228  private:
    229 
    230     /**
    231      * The stack of break engines created by this factory
    232      * @internal
    233      */
    234 
    235   UStack    *fEngines;
    236 
    237  public:
    238 
    239   /**
    240    * <p>Standard constructor.</p>
    241    *
    242    */
    243   ICULanguageBreakFactory(UErrorCode &status);
    244 
    245   /**
    246    * <p>Virtual destructor.</p>
    247    */
    248   virtual ~ICULanguageBreakFactory();
    249 
    250  /**
    251   * <p>Find and return a LanguageBreakEngine that can find the desired
    252   * kind of break for the set of characters to which the supplied
    253   * character belongs. It is up to the set of available engines to
    254   * determine what the sets of characters are.</p>
    255   *
    256   * @param c A character that begins a run for which a LanguageBreakEngine is
    257   * sought.
    258   * @param breakType The kind of text break for which a LanguageBreakEngine is
    259   * sought.
    260   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    261   */
    262   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
    263 
    264 protected:
    265  /**
    266   * <p>Create a LanguageBreakEngine for the set of characters to which
    267   * the supplied character belongs, for the specified break type.</p>
    268   *
    269   * @param c A character that begins a run for which a LanguageBreakEngine is
    270   * sought.
    271   * @param breakType The kind of text break for which a LanguageBreakEngine is
    272   * sought.
    273   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    274   */
    275   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
    276 
    277   /**
    278    * <p>Create a DictionaryMatcher for the specified script and break type.</p>
    279    * @param script An ISO 15924 script code that identifies the dictionary to be
    280    * created.
    281    * @param breakType The kind of text break for which a dictionary is
    282    * sought.
    283    * @return A DictionaryMatcher with the desired characteristics, or NULL.
    284    */
    285   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
    286 };
    287 
    288 U_NAMESPACE_END
    289 
    290     /* BRKENG_H */
    291 #endif
    292