Home | History | Annotate | Download | only in common
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 2006-2014, International Business Machines Corporation   *
      4  * and others. All Rights Reserved.                                            *
      5  *******************************************************************************
      6  */
      7 
      8 #ifndef DICTBE_H
      9 #define DICTBE_H
     10 
     11 #include "unicode/utypes.h"
     12 #include "unicode/uniset.h"
     13 #include "unicode/utext.h"
     14 
     15 #include "brkeng.h"
     16 
     17 U_NAMESPACE_BEGIN
     18 
     19 class DictionaryMatcher;
     20 class Normalizer2;
     21 
     22 /*******************************************************************
     23  * DictionaryBreakEngine
     24  */
     25 
     26 /**
     27  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
     28  * dictionary to determine language-specific breaks.</p>
     29  *
     30  * <p>After it is constructed a DictionaryBreakEngine may be shared between
     31  * threads without synchronization.</p>
     32  */
     33 class DictionaryBreakEngine : public LanguageBreakEngine {
     34  private:
     35     /**
     36      * The set of characters handled by this engine
     37      * @internal
     38      */
     39 
     40   UnicodeSet    fSet;
     41 
     42     /**
     43      * The set of break types handled by this engine
     44      * @internal
     45      */
     46 
     47   uint32_t      fTypes;
     48 
     49   /**
     50    * <p>Default constructor.</p>
     51    *
     52    */
     53   DictionaryBreakEngine();
     54 
     55  public:
     56 
     57   /**
     58    * <p>Constructor setting the break types handled.</p>
     59    *
     60    * @param breakTypes A bitmap of types handled by the engine.
     61    */
     62   DictionaryBreakEngine( uint32_t breakTypes );
     63 
     64   /**
     65    * <p>Virtual destructor.</p>
     66    */
     67   virtual ~DictionaryBreakEngine();
     68 
     69   /**
     70    * <p>Indicate whether this engine handles a particular character for
     71    * a particular kind of break.</p>
     72    *
     73    * @param c A character which begins a run that the engine might handle
     74    * @param breakType The type of text break which the caller wants to determine
     75    * @return TRUE if this engine handles the particular character and break
     76    * type.
     77    */
     78   virtual UBool handles( UChar32 c, int32_t breakType ) const;
     79 
     80   /**
     81    * <p>Find any breaks within a run in the supplied text.</p>
     82    *
     83    * @param text A UText representing the text. The iterator is left at
     84    * the end of the run of characters which the engine is capable of handling
     85    * that starts from the first (or last) character in the range.
     86    * @param startPos The start of the run within the supplied text.
     87    * @param endPos The end of the run within the supplied text.
     88    * @param reverse Whether the caller is looking for breaks in a reverse
     89    * direction.
     90    * @param breakType The type of break desired, or -1.
     91    * @param foundBreaks An allocated C array of the breaks found, if any
     92    * @return The number of breaks found.
     93    */
     94   virtual int32_t findBreaks( UText *text,
     95                               int32_t startPos,
     96                               int32_t endPos,
     97                               UBool reverse,
     98                               int32_t breakType,
     99                               UStack &foundBreaks ) const;
    100 
    101  protected:
    102 
    103  /**
    104   * <p>Set the character set handled by this engine.</p>
    105   *
    106   * @param set A UnicodeSet of the set of characters handled by the engine
    107   */
    108   virtual void setCharacters( const UnicodeSet &set );
    109 
    110  /**
    111   * <p>Set the break types handled by this engine.</p>
    112   *
    113   * @param breakTypes A bitmap of types handled by the engine.
    114   */
    115 //  virtual void setBreakTypes( uint32_t breakTypes );
    116 
    117  /**
    118   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    119   *
    120   * @param text A UText representing the text
    121   * @param rangeStart The start of the range of dictionary characters
    122   * @param rangeEnd The end of the range of dictionary characters
    123   * @param foundBreaks Output of C array of int32_t break positions, or 0
    124   * @return The number of breaks found
    125   */
    126   virtual int32_t divideUpDictionaryRange( UText *text,
    127                                            int32_t rangeStart,
    128                                            int32_t rangeEnd,
    129                                            UStack &foundBreaks ) const = 0;
    130 
    131 };
    132 
    133 /*******************************************************************
    134  * ThaiBreakEngine
    135  */
    136 
    137 /**
    138  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
    139  * dictionary and heuristics to determine Thai-specific breaks.</p>
    140  *
    141  * <p>After it is constructed a ThaiBreakEngine may be shared between
    142  * threads without synchronization.</p>
    143  */
    144 class ThaiBreakEngine : public DictionaryBreakEngine {
    145  private:
    146     /**
    147      * The set of characters handled by this engine
    148      * @internal
    149      */
    150 
    151   UnicodeSet                fThaiWordSet;
    152   UnicodeSet                fEndWordSet;
    153   UnicodeSet                fBeginWordSet;
    154   UnicodeSet                fSuffixSet;
    155   UnicodeSet                fMarkSet;
    156   DictionaryMatcher  *fDictionary;
    157 
    158  public:
    159 
    160   /**
    161    * <p>Default constructor.</p>
    162    *
    163    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    164    * engine is deleted.
    165    */
    166   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    167 
    168   /**
    169    * <p>Virtual destructor.</p>
    170    */
    171   virtual ~ThaiBreakEngine();
    172 
    173  protected:
    174  /**
    175   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    176   *
    177   * @param text A UText representing the text
    178   * @param rangeStart The start of the range of dictionary characters
    179   * @param rangeEnd The end of the range of dictionary characters
    180   * @param foundBreaks Output of C array of int32_t break positions, or 0
    181   * @return The number of breaks found
    182   */
    183   virtual int32_t divideUpDictionaryRange( UText *text,
    184                                            int32_t rangeStart,
    185                                            int32_t rangeEnd,
    186                                            UStack &foundBreaks ) const;
    187 
    188 };
    189 
    190 /*******************************************************************
    191  * LaoBreakEngine
    192  */
    193 
    194 /**
    195  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
    196  * dictionary and heuristics to determine Lao-specific breaks.</p>
    197  *
    198  * <p>After it is constructed a LaoBreakEngine may be shared between
    199  * threads without synchronization.</p>
    200  */
    201 class LaoBreakEngine : public DictionaryBreakEngine {
    202  private:
    203     /**
    204      * The set of characters handled by this engine
    205      * @internal
    206      */
    207 
    208   UnicodeSet                fLaoWordSet;
    209   UnicodeSet                fEndWordSet;
    210   UnicodeSet                fBeginWordSet;
    211   UnicodeSet                fMarkSet;
    212   DictionaryMatcher  *fDictionary;
    213 
    214  public:
    215 
    216   /**
    217    * <p>Default constructor.</p>
    218    *
    219    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    220    * engine is deleted.
    221    */
    222   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    223 
    224   /**
    225    * <p>Virtual destructor.</p>
    226    */
    227   virtual ~LaoBreakEngine();
    228 
    229  protected:
    230  /**
    231   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    232   *
    233   * @param text A UText representing the text
    234   * @param rangeStart The start of the range of dictionary characters
    235   * @param rangeEnd The end of the range of dictionary characters
    236   * @param foundBreaks Output of C array of int32_t break positions, or 0
    237   * @return The number of breaks found
    238   */
    239   virtual int32_t divideUpDictionaryRange( UText *text,
    240                                            int32_t rangeStart,
    241                                            int32_t rangeEnd,
    242                                            UStack &foundBreaks ) const;
    243 
    244 };
    245 
    246 /*******************************************************************
    247  * BurmeseBreakEngine
    248  */
    249 
    250 /**
    251  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
    252  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
    253  *
    254  * <p>After it is constructed a BurmeseBreakEngine may be shared between
    255  * threads without synchronization.</p>
    256  */
    257 class BurmeseBreakEngine : public DictionaryBreakEngine {
    258  private:
    259     /**
    260      * The set of characters handled by this engine
    261      * @internal
    262      */
    263 
    264   UnicodeSet                fBurmeseWordSet;
    265   UnicodeSet                fEndWordSet;
    266   UnicodeSet                fBeginWordSet;
    267   UnicodeSet                fMarkSet;
    268   DictionaryMatcher  *fDictionary;
    269 
    270  public:
    271 
    272   /**
    273    * <p>Default constructor.</p>
    274    *
    275    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    276    * engine is deleted.
    277    */
    278   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    279 
    280   /**
    281    * <p>Virtual destructor.</p>
    282    */
    283   virtual ~BurmeseBreakEngine();
    284 
    285  protected:
    286  /**
    287   * <p>Divide up a range of known dictionary characters.</p>
    288   *
    289   * @param text A UText representing the text
    290   * @param rangeStart The start of the range of dictionary characters
    291   * @param rangeEnd The end of the range of dictionary characters
    292   * @param foundBreaks Output of C array of int32_t break positions, or 0
    293   * @return The number of breaks found
    294   */
    295   virtual int32_t divideUpDictionaryRange( UText *text,
    296                                            int32_t rangeStart,
    297                                            int32_t rangeEnd,
    298                                            UStack &foundBreaks ) const;
    299 
    300 };
    301 
    302 /*******************************************************************
    303  * KhmerBreakEngine
    304  */
    305 
    306 /**
    307  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
    308  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
    309  *
    310  * <p>After it is constructed a KhmerBreakEngine may be shared between
    311  * threads without synchronization.</p>
    312  */
    313 class KhmerBreakEngine : public DictionaryBreakEngine {
    314  private:
    315     /**
    316      * The set of characters handled by this engine
    317      * @internal
    318      */
    319 
    320   UnicodeSet                fKhmerWordSet;
    321   UnicodeSet                fEndWordSet;
    322   UnicodeSet                fBeginWordSet;
    323   UnicodeSet                fMarkSet;
    324   DictionaryMatcher  *fDictionary;
    325 
    326  public:
    327 
    328   /**
    329    * <p>Default constructor.</p>
    330    *
    331    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    332    * engine is deleted.
    333    */
    334   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    335 
    336   /**
    337    * <p>Virtual destructor.</p>
    338    */
    339   virtual ~KhmerBreakEngine();
    340 
    341  protected:
    342  /**
    343   * <p>Divide up a range of known dictionary characters.</p>
    344   *
    345   * @param text A UText representing the text
    346   * @param rangeStart The start of the range of dictionary characters
    347   * @param rangeEnd The end of the range of dictionary characters
    348   * @param foundBreaks Output of C array of int32_t break positions, or 0
    349   * @return The number of breaks found
    350   */
    351   virtual int32_t divideUpDictionaryRange( UText *text,
    352                                            int32_t rangeStart,
    353                                            int32_t rangeEnd,
    354                                            UStack &foundBreaks ) const;
    355 
    356 };
    357 
    358 #if !UCONFIG_NO_NORMALIZATION
    359 
    360 /*******************************************************************
    361  * CjkBreakEngine
    362  */
    363 
// Indicates the language/script that a CjkBreakEngine will handle.
    365 enum LanguageType {
    366     kKorean,
    367     kChineseJapanese
    368 };
    369 
    370 /**
    371  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
    372  * dictionary with costs associated with each word and
    373  * Viterbi decoding to determine CJK-specific breaks.</p>
    374  */
    375 class CjkBreakEngine : public DictionaryBreakEngine {
    376  protected:
    377     /**
    378      * The set of characters handled by this engine
    379      * @internal
    380      */
    381   UnicodeSet                fHangulWordSet;
    382   UnicodeSet                fHanWordSet;
    383   UnicodeSet                fKatakanaWordSet;
    384   UnicodeSet                fHiraganaWordSet;
    385 
    386   DictionaryMatcher        *fDictionary;
    387   const Normalizer2        *nfkcNorm2;
    388 
    389  public:
    390 
    391     /**
    392      * <p>Default constructor.</p>
    393      *
    394      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    395      * engine is deleted. The DictionaryMatcher must contain costs for each word
    396      * in order for the dictionary to work properly.
    397      */
    398   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
    399 
    400     /**
    401      * <p>Virtual destructor.</p>
    402      */
    403   virtual ~CjkBreakEngine();
    404 
    405  protected:
    406     /**
    407      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    408      *
    409      * @param text A UText representing the text
    410      * @param rangeStart The start of the range of dictionary characters
    411      * @param rangeEnd The end of the range of dictionary characters
    412      * @param foundBreaks Output of C array of int32_t break positions, or 0
    413      * @return The number of breaks found
    414      */
    415   virtual int32_t divideUpDictionaryRange( UText *text,
    416           int32_t rangeStart,
    417           int32_t rangeEnd,
    418           UStack &foundBreaks ) const;
    419 
    420 };
    421 
    422 #endif
    423 
    424 U_NAMESPACE_END
    425 
    426     /* DICTBE_H */
    427 #endif
    428