Home | History | Annotate | Download | only in common
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /**
      4  ************************************************************************************
      5  * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
      6  * All Rights Reserved.                                                             *
      7  ************************************************************************************
      8  */
      9 
     10 #ifndef BRKENG_H
     11 #define BRKENG_H
     12 
     13 #include "unicode/utypes.h"
     14 #include "unicode/uobject.h"
     15 #include "unicode/utext.h"
     16 #include "unicode/uscript.h"
     17 
     18 U_NAMESPACE_BEGIN
     19 
     20 class UnicodeSet;
     21 class UStack;
     22 class UVector32;
     23 class DictionaryMatcher;
     24 
     25 /*******************************************************************
     26  * LanguageBreakEngine
     27  */
     28 
     29 /**
     30  * <p>LanguageBreakEngines implement language-specific knowledge for
     31  * finding text boundaries within a run of characters belonging to a
     32  * specific set. The boundaries will be of a specific kind, e.g. word,
     33  * line, etc.</p>
     34  *
     35  * <p>LanguageBreakEngines should normally be implemented so as to
     36  * be shared between threads without locking.</p>
     37  */
     38 class LanguageBreakEngine : public UMemory {
     39  public:
     40 
     41   /**
     42    * <p>Default constructor.</p>
     43    *
     44    */
     45   LanguageBreakEngine();
     46 
     47   /**
     48    * <p>Virtual destructor.</p>
     49    */
     50   virtual ~LanguageBreakEngine();
     51 
     52  /**
     53   * <p>Indicate whether this engine handles a particular character for
     54   * a particular kind of break.</p>
     55   *
     56   * @param c A character which begins a run that the engine might handle
     57   * @param breakType The type of text break which the caller wants to determine
     58   * @return TRUE if this engine handles the particular character and break
     59   * type.
     60   */
     61   virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
     62 
     63  /**
     64   * <p>Find any breaks within a run in the supplied text.</p>
     65   *
     66   * @param text A UText representing the text. The
     67   * iterator is left at the end of the run of characters which the engine
     68   * is capable of handling.
     69   * @param startPos The start of the run within the supplied text.
     70   * @param endPos The end of the run within the supplied text.
     71   * @param breakType The type of break desired, or -1.
     72   * @param foundBreaks A Vector of int32_t to receive the breaks.
     73   * @return The number of breaks found.
     74   */
     75   virtual int32_t findBreaks( UText *text,
     76                               int32_t startPos,
     77                               int32_t endPos,
     78                               int32_t breakType,
     79                               UVector32 &foundBreaks ) const = 0;
     80 
     81 };
     82 
     83 /*******************************************************************
     84  * LanguageBreakFactory
     85  */
     86 
     87 /**
     88  * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
     89  * that can determine breaks for characters in a specific set, if
     90  * such an object can be found.</p>
     91  *
     92  * <p>If a LanguageBreakFactory is to be shared between threads,
     93  * appropriate synchronization must be used; there is none internal
     94  * to the factory.</p>
     95  *
     96  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
     97  * normally be shared between threads without synchronization, unless
     98  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
     99  *
    100  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
    101  * it returns when it itself is deleted, unless the specific subclass of
    102  * LanguageBreakFactory indicates otherwise. Naturally, the factory should
    103  * not be deleted until the LanguageBreakEngines it has returned are no
    104  * longer needed.</p>
    105  */
    106 class LanguageBreakFactory : public UMemory {
    107  public:
    108 
    109   /**
    110    * <p>Default constructor.</p>
    111    *
    112    */
    113   LanguageBreakFactory();
    114 
    115   /**
    116    * <p>Virtual destructor.</p>
    117    */
    118   virtual ~LanguageBreakFactory();
    119 
    120  /**
    121   * <p>Find and return a LanguageBreakEngine that can find the desired
    122   * kind of break for the set of characters to which the supplied
    123   * character belongs. It is up to the set of available engines to
    124   * determine what the sets of characters are.</p>
    125   *
    126   * @param c A character that begins a run for which a LanguageBreakEngine is
    127   * sought.
    128   * @param breakType The kind of text break for which a LanguageBreakEngine is
    129   * sought.
    130   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    131   */
    132   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
    133 
    134 };
    135 
    136 /*******************************************************************
    137  * UnhandledEngine
    138  */
    139 
    140 /**
    141  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
    142  * handles characters that no other LanguageBreakEngine is available to
    143  * handle. It is told the character and the type of break; at its
    144  * discretion it may handle more than the specified character (e.g.,
    145  * the entire script to which that character belongs.</p>
    146  *
    147  * <p>UnhandledEngines may not be shared between threads without
    148  * external synchronization.</p>
    149  */
    150 
    151 class UnhandledEngine : public LanguageBreakEngine {
    152  private:
    153 
    154     /**
    155      * The sets of characters handled, for each break type
    156      * @internal
    157      */
    158 
    159   UnicodeSet    *fHandled[4];
    160 
    161  public:
    162 
    163   /**
    164    * <p>Default constructor.</p>
    165    *
    166    */
    167   UnhandledEngine(UErrorCode &status);
    168 
    169   /**
    170    * <p>Virtual destructor.</p>
    171    */
    172   virtual ~UnhandledEngine();
    173 
    174  /**
    175   * <p>Indicate whether this engine handles a particular character for
    176   * a particular kind of break.</p>
    177   *
    178   * @param c A character which begins a run that the engine might handle
    179   * @param breakType The type of text break which the caller wants to determine
    180   * @return TRUE if this engine handles the particular character and break
    181   * type.
    182   */
    183   virtual UBool handles(UChar32 c, int32_t breakType) const;
    184 
    185  /**
    186   * <p>Find any breaks within a run in the supplied text.</p>
    187   *
    188   * @param text A UText representing the text (TODO: UText). The
    189   * iterator is left at the end of the run of characters which the engine
    190   * is capable of handling.
    191   * @param startPos The start of the run within the supplied text.
    192   * @param endPos The end of the run within the supplied text.
    193   * @param breakType The type of break desired, or -1.
    194   * @param foundBreaks An allocated C array of the breaks found, if any
    195   * @return The number of breaks found.
    196   */
    197   virtual int32_t findBreaks( UText *text,
    198                               int32_t startPos,
    199                               int32_t endPos,
    200                               int32_t breakType,
    201                               UVector32 &foundBreaks ) const;
    202 
    203  /**
    204   * <p>Tell the engine to handle a particular character and break type.</p>
    205   *
    206   * @param c A character which the engine should handle
    207   * @param breakType The type of text break for which the engine should handle c
    208   */
    209   virtual void handleCharacter(UChar32 c, int32_t breakType);
    210 
    211 };
    212 
    213 /*******************************************************************
    214  * ICULanguageBreakFactory
    215  */
    216 
    217 /**
    218  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
    219  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
    220  * data in the ICU data file.</p>
    221  */
    222 class ICULanguageBreakFactory : public LanguageBreakFactory {
    223  private:
    224 
    225     /**
    226      * The stack of break engines created by this factory
    227      * @internal
    228      */
    229 
    230   UStack    *fEngines;
    231 
    232  public:
    233 
    234   /**
    235    * <p>Standard constructor.</p>
    236    *
    237    */
    238   ICULanguageBreakFactory(UErrorCode &status);
    239 
    240   /**
    241    * <p>Virtual destructor.</p>
    242    */
    243   virtual ~ICULanguageBreakFactory();
    244 
    245  /**
    246   * <p>Find and return a LanguageBreakEngine that can find the desired
    247   * kind of break for the set of characters to which the supplied
    248   * character belongs. It is up to the set of available engines to
    249   * determine what the sets of characters are.</p>
    250   *
    251   * @param c A character that begins a run for which a LanguageBreakEngine is
    252   * sought.
    253   * @param breakType The kind of text break for which a LanguageBreakEngine is
    254   * sought.
    255   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    256   */
    257   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
    258 
    259 protected:
    260  /**
    261   * <p>Create a LanguageBreakEngine for the set of characters to which
    262   * the supplied character belongs, for the specified break type.</p>
    263   *
    264   * @param c A character that begins a run for which a LanguageBreakEngine is
    265   * sought.
    266   * @param breakType The kind of text break for which a LanguageBreakEngine is
    267   * sought.
    268   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    269   */
    270   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
    271 
    272   /**
    273    * <p>Create a DictionaryMatcher for the specified script and break type.</p>
    274    * @param script An ISO 15924 script code that identifies the dictionary to be
    275    * created.
    276    * @param breakType The kind of text break for which a dictionary is
    277    * sought.
    278    * @return A DictionaryMatcher with the desired characteristics, or NULL.
    279    */
    280   virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
    281 };
    282 
    283 U_NAMESPACE_END
    284 
    285     /* BRKENG_H */
    286 #endif
    287