Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2012-2014, International Business Machines
      4 * Corporation and others.  All Rights Reserved.
      5 *******************************************************************************
      6 * utf8collationiterator.h
      7 *
      8 * created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
      9 * created by: Markus W. Scherer
     10 */
     11 
     12 #ifndef __UTF8COLLATIONITERATOR_H__
     13 #define __UTF8COLLATIONITERATOR_H__
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if !UCONFIG_NO_COLLATION
     18 
     19 #include "cmemory.h"
     20 #include "collation.h"
     21 #include "collationdata.h"
     22 #include "normalizer2impl.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 /**
     27  * UTF-8 collation element and character iterator.
     28  * Handles normalized UTF-8 text inline; the text is either length-delimited or NUL-terminated.
     29  * Unnormalized text is handled by a subclass.
     30  */
     31 class U_I18N_API UTF8CollationIterator : public CollationIterator {
     32 public:
            /**
             * Creates an iterator over UTF-8 text.
             * The pointer and indexes are stored as given; nothing is copied or
             * validated here.
             *
             * @param d forwarded to the CollationIterator base-class constructor
             * @param numeric forwarded to the CollationIterator base-class constructor
             * @param s the UTF-8 bytes; stored in u8
             * @param p initial value of pos
             * @param len stored in length; <0 for NUL-terminated strings
             */
     33     UTF8CollationIterator(const CollationData *d, UBool numeric,
     34                           const uint8_t *s, int32_t p, int32_t len)
     35             : CollationIterator(d, numeric),
     36               u8(s), pos(p), length(len) {}
     37 
     38     virtual ~UTF8CollationIterator();
     39 
            /**
             * Defined out of line.
             * NOTE(review): presumably repositions the iterator at the byte index
             * newOffset in u8 -- confirm against the implementation file.
             */
     40     virtual void resetToOffset(int32_t newOffset);
     41 
            /**
             * Defined out of line.
             * NOTE(review): presumably returns the current byte index (pos) -- confirm.
             */
     42     virtual int32_t getOffset() const;
     43 
            /**
             * Defined out of line.
             * NOTE(review): presumably reads the code point at pos and moves forward
             * past it; the value returned at the end of the text is not visible
             * here -- confirm.
             */
     44     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
     45 
            /**
             * Defined out of line.
             * NOTE(review): presumably moves backward over one code point and returns
             * it; the value returned at the start of the text is not visible
             * here -- confirm.
             */
     46     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
     47 
     48 protected:
     49     /**
     50      * For byte sequences that are illegal in UTF-8, an error value may be returned
     51      * together with a bogus code point. The caller will ignore that code point.
     52      *
     53      * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
     54      * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns TRUE.
     55      *
     56      * Valid lead surrogates are returned from inside a normalized text segment,
     57      * where handleGetTrailSurrogate() will return the matching trail surrogate.
     58      */
     59     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
     60 
            /**
             * Defined out of line.
             * NOTE(review): presumably only relevant while length<0 (NUL-terminated
             * input), to record the end of the text once the terminator has been
             * read -- confirm.
             */
     61     virtual UBool foundNULTerminator();
     62 
            /**
             * Defined out of line. Per the handleNextCE32() documentation in this
             * class, this returns TRUE, so that the caller treats surrogate code
             * points like U+FFFD.
             */
     63     virtual UBool forbidSurrogateCodePoints() const;
     64 
            /**
             * Defined out of line.
             * NOTE(review): presumably advances pos over num code points -- confirm.
             */
     65     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
     66 
            /**
             * Defined out of line.
             * NOTE(review): presumably moves pos back over num code points -- confirm.
             */
     67     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
     68 
            // Text pointer exactly as passed to the constructor.
            // NOTE(review): the pointer is stored without copying, so the caller
            // presumably keeps the text alive for the iterator's lifetime -- confirm.
     69     const uint8_t *u8;
            // Current index; initialized from the constructor argument p.
            // NOTE(review): presumably a byte index into u8 -- confirm.
     70     int32_t pos;
     71     int32_t length;  // <0 for NUL-terminated strings
     72 };
     73 
     74 /**
     75  * Incrementally checks the input text for FCD and normalizes where necessary.
     76  */
     77 class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
     78 public:
            /**
             * Creates an FCD-checking iterator over UTF-8 text.
             * Starts in the CHECK_FWD state with start set to p; limit is left
             * uninitialized, matching the CHECK_FWD contract ("limit is undefined").
             *
             * @param data forwarded to the base class; also dereferenced here to bind
             *             nfcImpl, so it must not be NULL
             * @param numeric forwarded to the base class
             * @param s forwarded to the base class (the UTF-8 bytes)
             * @param p forwarded to the base class as the initial pos; also the
             *          initial value of start
             * @param len forwarded to the base class (<0 for NUL-terminated strings)
             */
     79     FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
     80                              const uint8_t *s, int32_t p, int32_t len)
     81             : UTF8CollationIterator(data, numeric, s, p, len),
     82               state(CHECK_FWD), start(p),
     83               nfcImpl(data->nfcImpl) {}
     84 
     85     virtual ~FCDUTF8CollationIterator();
     86 
            /**
             * Redeclares the base-class function; defined out of line.
             * NOTE(review): presumably also resets the FCD checking state in addition
             * to repositioning -- confirm against the implementation file.
             */
     87     virtual void resetToOffset(int32_t newOffset);
     88 
            /**
             * Redeclares the base-class function; defined out of line.
             * NOTE(review): in the IN_NORMALIZED state pos indexes the normalized
             * string rather than the input, so this presumably maps back to an
             * input-text offset -- confirm.
             */
     89     virtual int32_t getOffset() const;
     90 
            /**
             * Redeclares the base-class function; defined out of line.
             * NOTE(review): presumably returns the next code point after making sure
             * the text at pos passes the FCD check or has been normalized -- confirm.
             */
     91     virtual UChar32 nextCodePoint(UErrorCode &errorCode);
     92 
            /**
             * Redeclares the base-class function; defined out of line.
             * NOTE(review): presumably the backward counterpart of
             * nextCodePoint() -- confirm.
             */
     93     virtual UChar32 previousCodePoint(UErrorCode &errorCode);
     94 
     95 protected:
            /**
             * Redeclares the base-class function; defined out of line.
             * See the documentation of UTF8CollationIterator::handleNextCE32()
             * for the handling of illegal byte sequences and surrogates.
             */
     96     virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode);
     97 
            /**
             * Defined out of line. Per the UTF8CollationIterator::handleNextCE32()
             * documentation, this returns the trail surrogate matching a valid lead
             * surrogate that was returned from inside a normalized text segment.
             */
     98     virtual UChar handleGetTrailSurrogate();
     99 
            /**
             * Redeclares the base-class function; defined out of line.
             */
    100     virtual UBool foundNULTerminator();
    101 
            /**
             * Redeclares the base-class function; defined out of line.
             */
    102     virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode);
    103 
            /**
             * Redeclares the base-class function; defined out of line.
             */
    104     virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode);
    105 
    106 private:
            // NOTE(review): going by the names, these presumably test whether the code
            // point after / before pos has a non-zero lead / trail canonical combining
            // class (lccc / tccc) -- confirm against the implementation file.
    107     UBool nextHasLccc() const;
    108     UBool previousHasTccc() const;
    109 
    110     /**
    111      * Switches to forward checking if possible.
    112      */
    113     void switchToForward();
    114 
    115     /**
    116      * Extends the FCD text segment forward or normalizes around pos.
    117      * @return TRUE if success
    118      */
    119     UBool nextSegment(UErrorCode &errorCode);
    120 
    121     /**
    122      * Switches to backward checking.
    123      */
    124     void switchToBackward();
    125 
    126     /**
    127      * Extends the FCD text segment backward or normalizes around pos.
    128      * @return TRUE if success
    129      */
    130     UBool previousSegment(UErrorCode &errorCode);
    131 
            /**
             * Defined out of line.
             * NOTE(review): presumably normalizes s into the `normalized` member and,
             * like nextSegment()/previousSegment(), returns TRUE on success -- confirm.
             */
    132     UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
    133 
            /**
             * Checking/iteration mode. It determines which of start, limit and pos
             * are meaningful, as documented on each value.
             */
    134     enum State {
    135         /**
    136          * The input text [start..pos[ passes the FCD check.
    137          * Moving forward checks incrementally.
    138          * limit is undefined.
    139          */
    140         CHECK_FWD,
    141         /**
    142          * The input text [pos..limit[ passes the FCD check.
    143          * Moving backward checks incrementally.
    144          * start is undefined.
    145          */
    146         CHECK_BWD,
    147         /**
    148          * The input text [start..limit[ passes the FCD check.
    149          * pos tracks the current text index.
    150          */
    151         IN_FCD_SEGMENT,
    152         /**
    153          * The input text [start..limit[ failed the FCD check and was normalized.
    154          * pos tracks the current index in the normalized string.
    155          */
    156         IN_NORMALIZED
    157     };
    158 
            // Current mode; the constructor sets it to CHECK_FWD.
    159     State state;
    160 
            // Segment boundaries in the input text; which of them are defined
            // depends on state (see enum State). limit is not set by the constructor.
    161     int32_t start;
    162     int32_t limit;
    163 
            // Bound to data->nfcImpl in the constructor. This is a reference, so the
            // referenced object must outlive this iterator.
    164     const Normalizer2Impl &nfcImpl;
            // NOTE(review): presumably holds the normalized form of [start..limit[
            // while state is IN_NORMALIZED -- confirm.
    165     UnicodeString normalized;
    166 };
    167 
    168 U_NAMESPACE_END
    169 
    170 #endif  // !UCONFIG_NO_COLLATION
    171 #endif  // __UTF8COLLATIONITERATOR_H__
    172