Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  **********************************************************************
      5  *   Copyright (C) 2005-2012, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  **********************************************************************
      8  */
      9 
     10 #ifndef __CSRMBCS_H
     11 #define __CSRMBCS_H
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_CONVERSION
     16 
     17 #include "csrecog.h"
     18 
     19 U_NAMESPACE_BEGIN
     20 
     21 // "Character"  iterated character class.
     22 //    Recognizers for specific mbcs encodings make their "characters" available
     23 //    by providing a nextChar() function that fills in an instance of IteratedChar
     24 //    with the next char from the input.
     25 //    The returned characters are not converted to Unicode, but remain as the raw
     26 //    bytes (concatenated into an int) from the codepage data.
     27 //
     28 //  For Asian charsets, use the raw input rather than the input that has been
     29 //   stripped of markup.  Detection only considers multi-byte chars, effectively
     30 //   stripping markup anyway, and double byte chars do occur in markup too.
     31 //
     32 class IteratedChar : public UMemory  // record that a recognizer's nextChar() fills in with one raw character
     33 {
     34 public:
     35     uint32_t charValue;             // 1-4 bytes from the raw input data, packed into one integer (not converted to Unicode)
     36     int32_t  index;                 // NOTE(review): presumably the input offset where this character starts -- confirm in the .cpp
     37     int32_t  nextIndex;             // NOTE(review): presumably the input offset of the next unread byte -- confirm in the .cpp
     38     UBool    error;                 // NOTE(review): presumably set when the bytes read are not valid for the encoding -- confirm
     39     UBool    done;                  // NOTE(review): presumably set once the input has been exhausted -- confirm
     40 
     41 public:
     42     IteratedChar();                 // defined out of line; the initial field values are not visible in this header
     43     //void reset();                 // (disabled declaration, kept as in the original)
     44     int32_t nextByte(InputText* det);   // takes the InputText being scanned; the return contract is defined in the .cpp
     45 };
     46 
     47 
     48 class CharsetRecog_mbcs : public CharsetRecognizer {
     49 
     50 protected:
     51     /**
     52      * Test the match of this charset with the input text data.
     53      *
     54      * @param det             The InputText holding the data to be checked for being in this charset.
     55      * @param commonChars     Array of 16-bit values. NOTE(review): presumably the charset's
     56      *                        frequently occurring characters -- confirm against the .cpp.
     57      * @param commonCharsLen  Length of commonChars (presumably its element count).
     58      * @return     Per this comment's Java-derived wording, two values packed into one int:
     59      *             bits 0-7:  the match confidence, ranging from 0-100;
     60      *             bits 8-15: the match reason, an enum-like value.
     61      *             NOTE(review): confirm the C++ implementation still packs both values.
     62      */
     63     int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
     64 
     65 public:
     66 
     67     virtual ~CharsetRecog_mbcs();
     68 
     69     /**
     70      * Get the IANA name of this charset.
     71      * @return the charset name.
     72      */
     73 
     74     const char *getName() const = 0;        // pure in this class: each concrete recognizer supplies its own name
     75     const char *getLanguage() const = 0;    // pure in this class: each concrete recognizer supplies its own language string
     76     UBool match(InputText* input, CharsetMatch *results) const = 0;  // pure; NOTE(review): presumably reports the outcome through *results -- confirm
     77 
     78     /**
     79      * Get the next character (however many bytes it is) from the input data.
     80      *    Subclasses for specific charset encodings must implement this function
     81      *    to get characters according to the rules of their encoding scheme.
     82      *
     83      *  This function is not a method of class IteratedChar only because
     84      *   that would require a lot of extra derived classes, which is awkward.
     85      * @param it      The IteratedChar "struct" into which the returned char is placed.
     86      * @param textIn  The InputText that supplies the input byte data
     87      *                being iterated over.
     88      * @return        True if a character was returned, false at end of input.
     89      */
     90     virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;
     91 
     92 };
     93 
     94 
     95 /**
     96  *   Shift-JIS charset recognizer.
     97  *
     98  */
     99 class CharsetRecog_sjis : public CharsetRecog_mbcs {  // concrete recognizer: declares every function left pure by CharsetRecog_mbcs
    100 public:
    101     virtual ~CharsetRecog_sjis();  // defined out of line
    102 
    103     UBool nextChar(IteratedChar *it, InputText *det) const;  // Shift-JIS version of CharsetRecog_mbcs::nextChar (same signature)
    104 
    105     UBool match(InputText* input, CharsetMatch *results) const;  // declares the match() that CharsetRecog_mbcs leaves pure
    106 
    107     const char *getName() const;      // charset name for this recognizer
    108     const char *getLanguage() const;  // language string for this recognizer
    109 
    110 };
    111 
    112 
    113 /**
    114  *   EUC charset recognizers.  One abstract class that provides the common function
    115  *             for getting the next character according to the EUC encoding scheme,
    116  *             and derived classes for the specific encodings (EUC-JP and EUC-KR below).
    117  *
    118  */
    119 class CharsetRecog_euc : public CharsetRecog_mbcs  // still abstract: only nextChar() is given a non-pure declaration here
    120 {
    121 public:
    122     virtual ~CharsetRecog_euc();  // defined out of line
    123 
    124     const char *getName() const = 0;      // remains pure; supplied by the encoding-specific subclass
    125     const char *getLanguage() const = 0;  // remains pure; supplied by the encoding-specific subclass
    126 
    127     UBool match(InputText* input, CharsetMatch *results) const = 0;  // remains pure; supplied by the encoding-specific subclass
    128     /*
    129      *  Get the next character value for EUC based encodings.
    130      *  Shared by the EUC subclasses, which do not redeclare it.
    131      *  Character "value" is simply the raw bytes that make up the character
    132      *     packed into an int.
    133      */
    134     UBool nextChar(IteratedChar *it, InputText *det) const;
    135 };
    136 
    137 /**
    138  * The charset recognizer for EUC-JP.  A singleton instance of this class
    139  *    is created and kept by the public CharsetDetector class
    140  */
    141 class CharsetRecog_euc_jp : public CharsetRecog_euc  // nextChar() is not redeclared; it comes from CharsetRecog_euc
    142 {
    143 public:
    144     virtual ~CharsetRecog_euc_jp();  // defined out of line
    145 
    146     const char *getName() const;      // declares the getName() left pure by CharsetRecog_euc
    147     const char *getLanguage() const;  // declares the getLanguage() left pure by CharsetRecog_euc
    148 
    149     UBool match(InputText* input, CharsetMatch *results) const;  // declares the match() left pure by CharsetRecog_euc
    150 };
    151 
    152 /**
    153  * The charset recognizer for EUC-KR.  A singleton instance of this class
    154  *    is created and kept by the public CharsetDetector class
    155  */
    156 class CharsetRecog_euc_kr : public CharsetRecog_euc  // nextChar() is not redeclared; it comes from CharsetRecog_euc
    157 {
    158 public:
    159     virtual ~CharsetRecog_euc_kr();  // defined out of line
    160 
    161     const char *getName() const;      // declares the getName() left pure by CharsetRecog_euc
    162     const char *getLanguage() const;  // declares the getLanguage() left pure by CharsetRecog_euc
    163 
    164     UBool match(InputText* input, CharsetMatch *results) const;  // declares the match() left pure by CharsetRecog_euc
    165 };
    166 
    167 /**
    168  *
    169  *   Big5 charset recognizer.
    170  *
    171  */
    172 class CharsetRecog_big5 : public CharsetRecog_mbcs  // concrete recognizer: declares every function left pure by CharsetRecog_mbcs
    173 {
    174 public:
    175     virtual ~CharsetRecog_big5();  // defined out of line
    176 
    177     UBool nextChar(IteratedChar* it, InputText* det) const;  // Big5 version of CharsetRecog_mbcs::nextChar (same signature)
    178 
    179     const char *getName() const;      // charset name for this recognizer
    180     const char *getLanguage() const;  // language string for this recognizer
    181 
    182     UBool match(InputText* input, CharsetMatch *results) const;  // declares the match() that CharsetRecog_mbcs leaves pure
    183 };
    184 
    185 
    186 /**
    187  *
    188  *   GB-18030 recognizer. Uses simplified Chinese statistics.
    189  *
    190  */
    191 class CharsetRecog_gb_18030 : public CharsetRecog_mbcs  // concrete recognizer: declares every function left pure by CharsetRecog_mbcs
    192 {
    193 public:
    194     virtual ~CharsetRecog_gb_18030();  // defined out of line
    195 
    196     UBool nextChar(IteratedChar* it, InputText* det) const;  // GB-18030 version of CharsetRecog_mbcs::nextChar (same signature)
    197 
    198     const char *getName() const;      // charset name for this recognizer
    199     const char *getLanguage() const;  // language string for this recognizer
    200 
    201     UBool match(InputText* input, CharsetMatch *results) const;  // declares the match() that CharsetRecog_mbcs leaves pure
    202 };
    203 
    204 U_NAMESPACE_END
    205 
    206 #endif
    207 #endif /* __CSRMBCS_H */
    208