Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /**
      4  *******************************************************************************
      5  * Copyright (C) 2006-2014, International Business Machines Corporation   *
      6  * and others. All Rights Reserved.                                            *
      7  *******************************************************************************
      8  */
      9 
     10 #ifndef DICTBE_H
     11 #define DICTBE_H
     12 
     13 #include "unicode/utypes.h"
     14 #include "unicode/uniset.h"
     15 #include "unicode/utext.h"
     16 
     17 #include "brkeng.h"
     18 #include "uvectr32.h"
     19 
     20 U_NAMESPACE_BEGIN
     21 
     22 class DictionaryMatcher;
     23 class Normalizer2;
     24 
     25 /*******************************************************************
     26  * DictionaryBreakEngine
     27  */
     28 
     29 /**
     30  * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
     31  * dictionary to determine language-specific breaks.</p>
     32  *
     33  * <p>After it is constructed a DictionaryBreakEngine may be shared between
     34  * threads without synchronization.</p>
     35  */
     36 class DictionaryBreakEngine : public LanguageBreakEngine {
     37  private:
     38     /**
     39      * The set of characters handled by this engine
     40      * @internal
     41      */
     42 
     43   UnicodeSet    fSet;
     44 
     45     /**
     46      * The set of break types handled by this engine
     47      * @internal
     48      */
     49 
     50   uint32_t      fTypes;
     51 
     52   /**
     53    * <p>Default constructor.</p>
     54    *
     55    */
     56   DictionaryBreakEngine();
     57 
     58  public:
     59 
     60   /**
     61    * <p>Constructor setting the break types handled.</p>
     62    *
     63    * @param breakTypes A bitmap of types handled by the engine.
     64    */
     65   DictionaryBreakEngine( uint32_t breakTypes );
     66 
     67   /**
     68    * <p>Virtual destructor.</p>
     69    */
     70   virtual ~DictionaryBreakEngine();
     71 
     72   /**
     73    * <p>Indicate whether this engine handles a particular character for
     74    * a particular kind of break.</p>
     75    *
     76    * @param c A character which begins a run that the engine might handle
     77    * @param breakType The type of text break which the caller wants to determine
     78    * @return TRUE if this engine handles the particular character and break
     79    * type.
     80    */
     81   virtual UBool handles( UChar32 c, int32_t breakType ) const;
     82 
     83   /**
     84    * <p>Find any breaks within a run in the supplied text.</p>
     85    *
     86    * @param text A UText representing the text. The iterator is left at
     87    * the end of the run of characters which the engine is capable of handling
     88    * that starts from the first character in the range.
     89    * @param startPos The start of the run within the supplied text.
     90    * @param endPos The end of the run within the supplied text.
     91    * @param breakType The type of break desired, or -1.
     92    * @param foundBreaks vector of int32_t to receive the break positions
     93    * @return The number of breaks found.
     94    */
     95   virtual int32_t findBreaks( UText *text,
     96                               int32_t startPos,
     97                               int32_t endPos,
     98                               int32_t breakType,
     99                               UVector32 &foundBreaks ) const;
    100 
    101  protected:
    102 
    103  /**
    104   * <p>Set the character set handled by this engine.</p>
    105   *
    106   * @param set A UnicodeSet of the set of characters handled by the engine
    107   */
    108   virtual void setCharacters( const UnicodeSet &set );
    109 
    110  /**
    111   * <p>Set the break types handled by this engine.</p>
    112   *
    113   * @param breakTypes A bitmap of types handled by the engine.
    114   */
    115 //  virtual void setBreakTypes( uint32_t breakTypes );
    116 
    117  /**
    118   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    119   *
    120   * @param text A UText representing the text
    121   * @param rangeStart The start of the range of dictionary characters
    122   * @param rangeEnd The end of the range of dictionary characters
    123   * @param foundBreaks Output of C array of int32_t break positions, or 0
    124   * @return The number of breaks found
    125   */
    126   virtual int32_t divideUpDictionaryRange( UText *text,
    127                                            int32_t rangeStart,
    128                                            int32_t rangeEnd,
    129                                            UVector32 &foundBreaks ) const = 0;
    130 
    131 };
    132 
    133 /*******************************************************************
    134  * ThaiBreakEngine
    135  */
    136 
    137 /**
    138  * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
    139  * dictionary and heuristics to determine Thai-specific breaks.</p>
    140  *
    141  * <p>After it is constructed a ThaiBreakEngine may be shared between
    142  * threads without synchronization.</p>
    143  */
    144 class ThaiBreakEngine : public DictionaryBreakEngine {
    145  private:
    146     /**
    147      * The set of characters handled by this engine
    148      * @internal
    149      */
    150 
    151   UnicodeSet                fThaiWordSet;
    152   UnicodeSet                fEndWordSet;
    153   UnicodeSet                fBeginWordSet;
    154   UnicodeSet                fSuffixSet;
    155   UnicodeSet                fMarkSet;
    156   DictionaryMatcher  *fDictionary;
    157 
    158  public:
    159 
    160   /**
    161    * <p>Default constructor.</p>
    162    *
    163    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    164    * engine is deleted.
    165    */
    166   ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    167 
    168   /**
    169    * <p>Virtual destructor.</p>
    170    */
    171   virtual ~ThaiBreakEngine();
    172 
    173  protected:
    174  /**
    175   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    176   *
    177   * @param text A UText representing the text
    178   * @param rangeStart The start of the range of dictionary characters
    179   * @param rangeEnd The end of the range of dictionary characters
    180   * @param foundBreaks Output of C array of int32_t break positions, or 0
    181   * @return The number of breaks found
    182   */
    183   virtual int32_t divideUpDictionaryRange( UText *text,
    184                                            int32_t rangeStart,
    185                                            int32_t rangeEnd,
    186                                            UVector32 &foundBreaks ) const;
    187 
    188 };
    189 
    190 /*******************************************************************
    191  * LaoBreakEngine
    192  */
    193 
    194 /**
    195  * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
    196  * dictionary and heuristics to determine Lao-specific breaks.</p>
    197  *
    198  * <p>After it is constructed a LaoBreakEngine may be shared between
    199  * threads without synchronization.</p>
    200  */
    201 class LaoBreakEngine : public DictionaryBreakEngine {
    202  private:
    203     /**
    204      * The set of characters handled by this engine
    205      * @internal
    206      */
    207 
    208   UnicodeSet                fLaoWordSet;
    209   UnicodeSet                fEndWordSet;
    210   UnicodeSet                fBeginWordSet;
    211   UnicodeSet                fMarkSet;
    212   DictionaryMatcher  *fDictionary;
    213 
    214  public:
    215 
    216   /**
    217    * <p>Default constructor.</p>
    218    *
    219    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    220    * engine is deleted.
    221    */
    222   LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    223 
    224   /**
    225    * <p>Virtual destructor.</p>
    226    */
    227   virtual ~LaoBreakEngine();
    228 
    229  protected:
    230  /**
    231   * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    232   *
    233   * @param text A UText representing the text
    234   * @param rangeStart The start of the range of dictionary characters
    235   * @param rangeEnd The end of the range of dictionary characters
    236   * @param foundBreaks Output of C array of int32_t break positions, or 0
    237   * @return The number of breaks found
    238   */
    239   virtual int32_t divideUpDictionaryRange( UText *text,
    240                                            int32_t rangeStart,
    241                                            int32_t rangeEnd,
    242                                            UVector32 &foundBreaks ) const;
    243 
    244 };
    245 
    246 /*******************************************************************
    247  * BurmeseBreakEngine
    248  */
    249 
    250 /**
    251  * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
    252  * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
    253  *
    254  * <p>After it is constructed a BurmeseBreakEngine may be shared between
    255  * threads without synchronization.</p>
    256  */
    257 class BurmeseBreakEngine : public DictionaryBreakEngine {
    258  private:
    259     /**
    260      * The set of characters handled by this engine
    261      * @internal
    262      */
    263 
    264   UnicodeSet                fBurmeseWordSet;
    265   UnicodeSet                fEndWordSet;
    266   UnicodeSet                fBeginWordSet;
    267   UnicodeSet                fMarkSet;
    268   DictionaryMatcher  *fDictionary;
    269 
    270  public:
    271 
    272   /**
    273    * <p>Default constructor.</p>
    274    *
    275    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    276    * engine is deleted.
    277    */
    278   BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    279 
    280   /**
    281    * <p>Virtual destructor.</p>
    282    */
    283   virtual ~BurmeseBreakEngine();
    284 
    285  protected:
    286  /**
    287   * <p>Divide up a range of known dictionary characters.</p>
    288   *
    289   * @param text A UText representing the text
    290   * @param rangeStart The start of the range of dictionary characters
    291   * @param rangeEnd The end of the range of dictionary characters
    292   * @param foundBreaks Output of C array of int32_t break positions, or 0
    293   * @return The number of breaks found
    294   */
    295   virtual int32_t divideUpDictionaryRange( UText *text,
    296                                            int32_t rangeStart,
    297                                            int32_t rangeEnd,
    298                                            UVector32 &foundBreaks ) const;
    299 
    300 };
    301 
    302 /*******************************************************************
    303  * KhmerBreakEngine
    304  */
    305 
    306 /**
    307  * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
    308  * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
    309  *
    310  * <p>After it is constructed a KhmerBreakEngine may be shared between
    311  * threads without synchronization.</p>
    312  */
    313 class KhmerBreakEngine : public DictionaryBreakEngine {
    314  private:
    315     /**
    316      * The set of characters handled by this engine
    317      * @internal
    318      */
    319 
    320   UnicodeSet                fKhmerWordSet;
    321   UnicodeSet                fEndWordSet;
    322   UnicodeSet                fBeginWordSet;
    323   UnicodeSet                fMarkSet;
    324   DictionaryMatcher  *fDictionary;
    325 
    326  public:
    327 
    328   /**
    329    * <p>Default constructor.</p>
    330    *
    331    * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    332    * engine is deleted.
    333    */
    334   KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
    335 
    336   /**
    337    * <p>Virtual destructor.</p>
    338    */
    339   virtual ~KhmerBreakEngine();
    340 
    341  protected:
    342  /**
    343   * <p>Divide up a range of known dictionary characters.</p>
    344   *
    345   * @param text A UText representing the text
    346   * @param rangeStart The start of the range of dictionary characters
    347   * @param rangeEnd The end of the range of dictionary characters
    348   * @param foundBreaks Output of C array of int32_t break positions, or 0
    349   * @return The number of breaks found
    350   */
    351   virtual int32_t divideUpDictionaryRange( UText *text,
    352                                            int32_t rangeStart,
    353                                            int32_t rangeEnd,
    354                                            UVector32 &foundBreaks ) const;
    355 
    356 };
    357 
    358 #if !UCONFIG_NO_NORMALIZATION
    359 
    360 /*******************************************************************
    361  * CjkBreakEngine
    362  */
    363 
    364 //indicates language/script that the CjkBreakEngine will handle
    365 enum LanguageType {
    366     kKorean,
    367     kChineseJapanese
    368 };
    369 
    370 /**
    371  * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
    372  * dictionary with costs associated with each word and
    373  * Viterbi decoding to determine CJK-specific breaks.</p>
    374  */
    375 class CjkBreakEngine : public DictionaryBreakEngine {
    376  protected:
    377     /**
    378      * The set of characters handled by this engine
    379      * @internal
    380      */
    381   UnicodeSet                fHangulWordSet;
    382   UnicodeSet                fHanWordSet;
    383   UnicodeSet                fKatakanaWordSet;
    384   UnicodeSet                fHiraganaWordSet;
    385 
    386   DictionaryMatcher        *fDictionary;
    387   const Normalizer2        *nfkcNorm2;
    388 
    389  public:
    390 
    391     /**
    392      * <p>Default constructor.</p>
    393      *
    394      * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
    395      * engine is deleted. The DictionaryMatcher must contain costs for each word
    396      * in order for the dictionary to work properly.
    397      */
    398   CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
    399 
    400     /**
    401      * <p>Virtual destructor.</p>
    402      */
    403   virtual ~CjkBreakEngine();
    404 
    405  protected:
    406     /**
    407      * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
    408      *
    409      * @param text A UText representing the text
    410      * @param rangeStart The start of the range of dictionary characters
    411      * @param rangeEnd The end of the range of dictionary characters
    412      * @param foundBreaks Output of C array of int32_t break positions, or 0
    413      * @return The number of breaks found
    414      */
    415   virtual int32_t divideUpDictionaryRange( UText *text,
    416           int32_t rangeStart,
    417           int32_t rangeEnd,
    418           UVector32 &foundBreaks ) const;
    419 
    420 };
    421 
    422 #endif
    423 
    424 U_NAMESPACE_END
    425 
    426     /* DICTBE_H */
    427 #endif
    428