/**
 ************************************************************************************
 * Copyright (C) 2006-2007, International Business Machines Corporation and others. *
 * All Rights Reserved.                                                             *
 ************************************************************************************
 */
      7 
      8 #ifndef BRKENG_H
      9 #define BRKENG_H
     10 
     11 #include "unicode/utypes.h"
     12 #include "unicode/uobject.h"
     13 #include "unicode/utext.h"
     14 #include "unicode/uscript.h"
     15 
     16 U_NAMESPACE_BEGIN
     17 
     18 class UnicodeSet;
     19 class UStack;
     20 class CompactTrieDictionary;
     21 
     22 /*******************************************************************
     23  * LanguageBreakEngine
     24  */
     25 
     26 /**
     27  * <p>LanguageBreakEngines implement language-specific knowledge for
     28  * finding text boundaries within a run of characters belonging to a
     29  * specific set. The boundaries will be of a specific kind, e.g. word,
     30  * line, etc.</p>
     31  *
     32  * <p>LanguageBreakEngines should normally be implemented so as to
     33  * be shared between threads without locking.</p>
     34  */
     35 class LanguageBreakEngine : public UMemory {
     36  public:
     37 
     38   /**
     39    * <p>Default constructor.</p>
     40    *
     41    */
     42   LanguageBreakEngine();
     43 
     44   /**
     45    * <p>Virtual destructor.</p>
     46    */
     47   virtual ~LanguageBreakEngine();
     48 
     49  /**
     50   * <p>Indicate whether this engine handles a particular character for
     51   * a particular kind of break.</p>
     52   *
     53   * @param c A character which begins a run that the engine might handle
     54   * @param breakType The type of text break which the caller wants to determine
     55   * @return TRUE if this engine handles the particular character and break
     56   * type.
     57   */
     58   virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
     59 
     60  /**
     61   * <p>Find any breaks within a run in the supplied text.</p>
     62   *
     63   * @param text A UText representing the text. The
     64   * iterator is left at the end of the run of characters which the engine
     65   * is capable of handling.
     66   * @param startPos The start of the run within the supplied text.
     67   * @param endPos The end of the run within the supplied text.
     68   * @param reverse Whether the caller is looking for breaks in a reverse
     69   * direction.
     70   * @param breakType The type of break desired, or -1.
     71   * @param foundBreaks An allocated C array of the breaks found, if any
     72   * @return The number of breaks found.
     73   */
     74   virtual int32_t findBreaks( UText *text,
     75                               int32_t startPos,
     76                               int32_t endPos,
     77                               UBool reverse,
     78                               int32_t breakType,
     79                               UStack &foundBreaks ) const = 0;
     80 
     81 };
     82 
     83 /*******************************************************************
     84  * LanguageBreakFactory
     85  */
     86 
     87 /**
     88  * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
     89  * that can determine breaks for characters in a specific set, if
     90  * such an object can be found.</p>
     91  *
     92  * <p>If a LanguageBreakFactory is to be shared between threads,
     93  * appropriate synchronization must be used; there is none internal
     94  * to the factory.</p>
     95  *
     96  * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
     97  * normally be shared between threads without synchronization, unless
     98  * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
     99  *
    100  * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
    101  * it returns when it itself is deleted, unless the specific subclass of
    102  * LanguageBreakFactory indicates otherwise. Naturally, the factory should
    103  * not be deleted until the LanguageBreakEngines it has returned are no
    104  * longer needed.</p>
    105  */
    106 class LanguageBreakFactory : public UMemory {
    107  public:
    108 
    109   /**
    110    * <p>Default constructor.</p>
    111    *
    112    */
    113   LanguageBreakFactory();
    114 
    115   /**
    116    * <p>Virtual destructor.</p>
    117    */
    118   virtual ~LanguageBreakFactory();
    119 
    120  /**
    121   * <p>Find and return a LanguageBreakEngine that can find the desired
    122   * kind of break for the set of characters to which the supplied
    123   * character belongs. It is up to the set of available engines to
    124   * determine what the sets of characters are.</p>
    125   *
    126   * @param c A character that begins a run for which a LanguageBreakEngine is
    127   * sought.
    128   * @param breakType The kind of text break for which a LanguageBreakEngine is
    129   * sought.
    130   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    131   */
    132   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
    133 
    134 };
    135 
    136 /*******************************************************************
    137  * UnhandledEngine
    138  */
    139 
    140 /**
    141  * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
    142  * handles characters that no other LanguageBreakEngine is available to
    143  * handle. It is told the character and the type of break; at its
    144  * discretion it may handle more than the specified character (e.g.,
    145  * the entire script to which that character belongs.</p>
    146  *
    147  * <p>UnhandledEngines may not be shared between threads without
    148  * external synchronization.</p>
    149  */
    150 
    151 class UnhandledEngine : public LanguageBreakEngine {
    152  private:
    153 
    154     /**
    155      * The sets of characters handled, for each break type
    156      * @internal
    157      */
    158 
    159   UnicodeSet    *fHandled[4];
    160 
    161  public:
    162 
    163   /**
    164    * <p>Default constructor.</p>
    165    *
    166    */
    167   UnhandledEngine(UErrorCode &status);
    168 
    169   /**
    170    * <p>Virtual destructor.</p>
    171    */
    172   virtual ~UnhandledEngine();
    173 
    174  /**
    175   * <p>Indicate whether this engine handles a particular character for
    176   * a particular kind of break.</p>
    177   *
    178   * @param c A character which begins a run that the engine might handle
    179   * @param breakType The type of text break which the caller wants to determine
    180   * @return TRUE if this engine handles the particular character and break
    181   * type.
    182   */
    183   virtual UBool handles(UChar32 c, int32_t breakType) const;
    184 
    185  /**
    186   * <p>Find any breaks within a run in the supplied text.</p>
    187   *
    188   * @param text A UText representing the text (TODO: UText). The
    189   * iterator is left at the end of the run of characters which the engine
    190   * is capable of handling.
    191   * @param startPos The start of the run within the supplied text.
    192   * @param endPos The end of the run within the supplied text.
    193   * @param reverse Whether the caller is looking for breaks in a reverse
    194   * direction.
    195   * @param breakType The type of break desired, or -1.
    196   * @param foundBreaks An allocated C array of the breaks found, if any
    197   * @return The number of breaks found.
    198   */
    199   virtual int32_t findBreaks( UText *text,
    200                               int32_t startPos,
    201                               int32_t endPos,
    202                               UBool reverse,
    203                               int32_t breakType,
    204                               UStack &foundBreaks ) const;
    205 
    206  /**
    207   * <p>Tell the engine to handle a particular character and break type.</p>
    208   *
    209   * @param c A character which the engine should handle
    210   * @param breakType The type of text break for which the engine should handle c
    211   */
    212   virtual void handleCharacter(UChar32 c, int32_t breakType);
    213 
    214 };
    215 
    216 /*******************************************************************
    217  * ICULanguageBreakFactory
    218  */
    219 
    220 /**
    221  * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
    222  * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
    223  * data in the ICU data file.</p>
    224  */
    225 class ICULanguageBreakFactory : public LanguageBreakFactory {
    226  private:
    227 
    228     /**
    229      * The stack of break engines created by this factory
    230      * @internal
    231      */
    232 
    233   UStack    *fEngines;
    234 
    235  public:
    236 
    237   /**
    238    * <p>Standard constructor.</p>
    239    *
    240    */
    241   ICULanguageBreakFactory(UErrorCode &status);
    242 
    243   /**
    244    * <p>Virtual destructor.</p>
    245    */
    246   virtual ~ICULanguageBreakFactory();
    247 
    248  /**
    249   * <p>Find and return a LanguageBreakEngine that can find the desired
    250   * kind of break for the set of characters to which the supplied
    251   * character belongs. It is up to the set of available engines to
    252   * determine what the sets of characters are.</p>
    253   *
    254   * @param c A character that begins a run for which a LanguageBreakEngine is
    255   * sought.
    256   * @param breakType The kind of text break for which a LanguageBreakEngine is
    257   * sought.
    258   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    259   */
    260   virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
    261 
    262  protected:
    263 
    264  /**
    265   * <p>Create a LanguageBreakEngine for the set of characters to which
    266   * the supplied character belongs, for the specified break type.</p>
    267   *
    268   * @param c A character that begins a run for which a LanguageBreakEngine is
    269   * sought.
    270   * @param breakType The kind of text break for which a LanguageBreakEngine is
    271   * sought.
    272   * @return A LanguageBreakEngine with the desired characteristics, or 0.
    273   */
    274   virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
    275 
    276  /**
    277   * <p>Create a CompactTrieDictionary for the specified script and break type.</p>
    278   *
    279   * @param script An ISO 15924 script code that identifies the dictionary to be
    280   * created.
    281   * @param breakType The kind of text break for which a dictionary is
    282   * sought.
    283   * @return A CompactTrieDictionary with the desired characteristics, or 0.
    284   */
    285   virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType);
    286 
    287 };
    288 
    289 U_NAMESPACE_END
    290 
    291     /* BRKENG_H */
    292 #endif
    293