Home | History | Annotate | Download | only in common
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 2006,2012-2013, International Business Machines Corporation   *
      4  * and others. All Rights Reserved.                                            *
      5  *******************************************************************************
      6  */
      7 
      8 #ifndef DICTBE_H
      9 #define DICTBE_H
     10 
     11 #include "unicode/utypes.h"
     12 #include "unicode/uniset.h"
     13 #include "unicode/utext.h"
     14 
     15 #include "brkeng.h"
     16 
     17 U_NAMESPACE_BEGIN
     18 
     19 class DictionaryMatcher;
     20 
     21 /*******************************************************************
     22  * DictionaryBreakEngine
     23  */
     24 
     25 /**
     26  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
     27  * dictionary to determine language-specific breaks.</p>
     28  *
     29  * <p>After it is constructed a DictionaryBreakEngine may be shared between
     30  * threads without synchronization.</p>
     31  */
     32 class DictionaryBreakEngine : public LanguageBreakEngine {
     33  private:
     34     /**
     35      * The set of characters handled by this engine
     36      * @internal
     37      */
     38 
     39   UnicodeSet    fSet;
     40 
     41     /**
     42      * The set of break types handled by this engine
     43      * @internal
     44      */
     45 
     46   uint32_t      fTypes;
     47 
     48   /**
     49    * <p>Default constructor.</p>
     50    *
     51    */
     52   DictionaryBreakEngine();
     53 
     54  public:
     55 
     56   /**
     57    * <p>Constructor setting the break types handled.</p>
     58    *
     59    * @param breakTypes A bitmap of types handled by the engine.
     60    */
     61   DictionaryBreakEngine( uint32_t breakTypes );
     62 
     63   /**
     64    * <p>Virtual destructor.</p>
     65    */
     66   virtual ~DictionaryBreakEngine();
     67 
     68   /**
     69    * <p>Indicate whether this engine handles a particular character for
     70    * a particular kind of break.</p>
     71    *
     72    * @param c A character which begins a run that the engine might handle
     73    * @param breakType The type of text break which the caller wants to determine
     74    * @return TRUE if this engine handles the particular character and break
     75    * type.
     76    */
     77   virtual UBool handles( UChar32 c, int32_t breakType ) const;
     78 
     79   /**
     80    * <p>Find any breaks within a run in the supplied text.</p>
     81    *
     82    * @param text A UText representing the text. The iterator is left at
     83    * the end of the run of characters which the engine is capable of handling
     84    * that starts from the first (or last) character in the range.
     85    * @param startPos The start of the run within the supplied text.
     86    * @param endPos The end of the run within the supplied text.
     87    * @param reverse Whether the caller is looking for breaks in a reverse
     88    * direction.
     89    * @param breakType The type of break desired, or -1.
     90    * @param foundBreaks An allocated C array of the breaks found, if any
     91    * @return The number of breaks found.
     92    */
     93   virtual int32_t findBreaks( UText *text,
     94                               int32_t startPos,
     95                               int32_t endPos,
     96                               UBool reverse,
     97                               int32_t breakType,
     98                               UStack &foundBreaks ) const;
     99 
    100  protected:
    101 
    102  /**
    103   * <p>Set the character set handled by this engine.</p>
    104   *
    105   * @param set A UnicodeSet of the set of characters handled by the engine
    106   */
    107   virtual void setCharacters( const UnicodeSet &set );
    108 
    109  /**
    110   * <p>Set the break types handled by this engine.</p>
    111   *
    112   * @param breakTypes A bitmap of types handled by the engine.
    113   */
    114 //  virtual void setBreakTypes( uint32_t breakTypes );
    115 
    116  /**
    117   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    118   *
    119   * @param text A UText representing the text
    120   * @param rangeStart The start of the range of dictionary characters
    121   * @param rangeEnd The end of the range of dictionary characters
    122   * @param foundBreaks Output of C array of int32_t break positions, or 0
    123   * @return The number of breaks found
    124   */
    125   virtual int32_t divideUpDictionaryRange( UText *text,
    126                                            int32_t rangeStart,
    127                                            int32_t rangeEnd,
    128                                            UStack &foundBreaks ) const = 0;
    129 
    130 };
    131 
    132 /*******************************************************************
    133  * ThaiBreakEngine
    134  */
    135 
    136 /**
    137  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
    138  * dictionary and heuristics to determine Thai-specific breaks.</p>
    139  *
    140  * <p>After it is constructed a ThaiBreakEngine may be shared between
    141  * threads without synchronization.</p>
    142  */
    143 class ThaiBreakEngine : public DictionaryBreakEngine {
    144  private:
    145     /**
    146      * The set of characters handled by this engine
    147      * @internal
    148      */
    149 
    150   UnicodeSet                fThaiWordSet;
    151   UnicodeSet                fEndWordSet;
    152   UnicodeSet                fBeginWordSet;
    153   UnicodeSet                fSuffixSet;
    154   UnicodeSet                fMarkSet;
    155   DictionaryMatcher  *fDictionary;
    156 
    157  public:
    158 
    159   /**
    160    * <p>Default constructor.</p>
    161    *
    162    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    163    * engine is deleted.
    164    */
    165   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    166 
    167   /**
    168    * <p>Virtual destructor.</p>
    169    */
    170   virtual ~ThaiBreakEngine();
    171 
    172  protected:
    173  /**
    174   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    175   *
    176   * @param text A UText representing the text
    177   * @param rangeStart The start of the range of dictionary characters
    178   * @param rangeEnd The end of the range of dictionary characters
    179   * @param foundBreaks Output of C array of int32_t break positions, or 0
    180   * @return The number of breaks found
    181   */
    182   virtual int32_t divideUpDictionaryRange( UText *text,
    183                                            int32_t rangeStart,
    184                                            int32_t rangeEnd,
    185                                            UStack &foundBreaks ) const;
    186 
    187 };
    188 
    189 /*******************************************************************
    190  * LaoBreakEngine
    191  */
    192 
    193 /**
    194  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
    195  * dictionary and heuristics to determine Lao-specific breaks.</p>
    196  *
    197  * <p>After it is constructed a LaoBreakEngine may be shared between
    198  * threads without synchronization.</p>
    199  */
    200 class LaoBreakEngine : public DictionaryBreakEngine {
    201  private:
    202     /**
    203      * The set of characters handled by this engine
    204      * @internal
    205      */
    206 
    207   UnicodeSet                fLaoWordSet;
    208   UnicodeSet                fEndWordSet;
    209   UnicodeSet                fBeginWordSet;
    210   UnicodeSet                fMarkSet;
    211   DictionaryMatcher  *fDictionary;
    212 
    213  public:
    214 
    215   /**
    216    * <p>Default constructor.</p>
    217    *
    218    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    219    * engine is deleted.
    220    */
    221   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    222 
    223   /**
    224    * <p>Virtual destructor.</p>
    225    */
    226   virtual ~LaoBreakEngine();
    227 
    228  protected:
    229  /**
    230   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    231   *
    232   * @param text A UText representing the text
    233   * @param rangeStart The start of the range of dictionary characters
    234   * @param rangeEnd The end of the range of dictionary characters
    235   * @param foundBreaks Output of C array of int32_t break positions, or 0
    236   * @return The number of breaks found
    237   */
    238   virtual int32_t divideUpDictionaryRange( UText *text,
    239                                            int32_t rangeStart,
    240                                            int32_t rangeEnd,
    241                                            UStack &foundBreaks ) const;
    242 
    243 };
    244 
    245 /*******************************************************************
    246  * KhmerBreakEngine
    247  */
    248 
    249 /**
    250  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
    251  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
    252  *
    253  * <p>After it is constructed a KhmerBreakEngine may be shared between
    254  * threads without synchronization.</p>
    255  */
    256 class KhmerBreakEngine : public DictionaryBreakEngine {
    257  private:
    258     /**
    259      * The set of characters handled by this engine
    260      * @internal
    261      */
    262 
    263   UnicodeSet                fKhmerWordSet;
    264   UnicodeSet                fEndWordSet;
    265   UnicodeSet                fBeginWordSet;
    266   UnicodeSet                fMarkSet;
    267   DictionaryMatcher  *fDictionary;
    268 
    269  public:
    270 
    271   /**
    272    * <p>Default constructor.</p>
    273    *
    274    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    275    * engine is deleted.
    276    */
    277   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    278 
    279   /**
    280    * <p>Virtual destructor.</p>
    281    */
    282   virtual ~KhmerBreakEngine();
    283 
    284  protected:
    285  /**
    286   * <p>Divide up a range of known dictionary characters.</p>
    287   *
    288   * @param text A UText representing the text
    289   * @param rangeStart The start of the range of dictionary characters
    290   * @param rangeEnd The end of the range of dictionary characters
    291   * @param foundBreaks Output of C array of int32_t break positions, or 0
    292   * @return The number of breaks found
    293   */
    294   virtual int32_t divideUpDictionaryRange( UText *text,
    295                                            int32_t rangeStart,
    296                                            int32_t rangeEnd,
    297                                            UStack &foundBreaks ) const;
    298 
    299 };
    300 
    301 #if !UCONFIG_NO_NORMALIZATION
    302 
    303 /*******************************************************************
    304  * CjkBreakEngine
    305  */
    306 
    307 //indicates language/script that the CjkBreakEngine will handle
    308 enum LanguageType {
    309     kKorean,
    310     kChineseJapanese
    311 };
    312 
    313 /**
    314  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
    315  * dictionary with costs associated with each word and
    316  * Viterbi decoding to determine CJK-specific breaks.</p>
    317  */
    318 class CjkBreakEngine : public DictionaryBreakEngine {
    319  protected:
    320     /**
    321      * The set of characters handled by this engine
    322      * @internal
    323      */
    324   UnicodeSet                fHangulWordSet;
    325   UnicodeSet                fHanWordSet;
    326   UnicodeSet                fKatakanaWordSet;
    327   UnicodeSet                fHiraganaWordSet;
    328 
    329   DictionaryMatcher  *fDictionary;
    330 
    331  public:
    332 
    333     /**
    334      * <p>Default constructor.</p>
    335      *
    336      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    337      * engine is deleted. The DictionaryMatcher must contain costs for each word
    338      * in order for the dictionary to work properly.
    339      */
    340   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
    341 
    342     /**
    343      * <p>Virtual destructor.</p>
    344      */
    345   virtual ~CjkBreakEngine();
    346 
    347  protected:
    348     /**
    349      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    350      *
    351      * @param text A UText representing the text
    352      * @param rangeStart The start of the range of dictionary characters
    353      * @param rangeEnd The end of the range of dictionary characters
    354      * @param foundBreaks Output of C array of int32_t break positions, or 0
    355      * @return The number of breaks found
    356      */
    357   virtual int32_t divideUpDictionaryRange( UText *text,
    358           int32_t rangeStart,
    359           int32_t rangeEnd,
    360           UStack &foundBreaks ) const;
    361 
    362 };
    363 
    364 #endif
    365 
    366 U_NAMESPACE_END
    367 
    368     /* DICTBE_H */
    369 #endif
    370