Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2008, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #ifndef __CSRMBCS_H
      9 #define __CSRMBCS_H
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_CONVERSION
     14 
     15 #include "csrecog.h"
     16 
     17 U_NAMESPACE_BEGIN
     18 
     19 // "Character"  iterated character class.
     20 //    Recognizers for specific mbcs encodings make their "characters" available
     21 //    by providing a nextChar() function that fills in an instance of IteratedChar
     22 //    with the next char from the input.
     23 //    The returned characters are not converted to Unicode, but remain as the raw
     24 //    bytes (concatenated into an int) from the codepage data.
     25 //
     26 //  For Asian charsets, use the raw input rather than the input that has been
     27 //   stripped of markup.  Detection only considers multi-byte chars, effectively
     28 //   stripping markup anyway, and double byte chars do occur in markup too.
     29 //
     30 class IteratedChar : public UMemory
     31 {
     32 public:
     33     uint32_t charValue;             // 1-4 bytes from the raw input data
     34     int32_t  index;
     35     int32_t  nextIndex;
     36     UBool    error;
     37     UBool    done;
     38 
     39 public:
     40     IteratedChar();
     41     //void reset();
     42     int32_t nextByte(InputText* det);
     43 };
     44 
     45 
     46 class CharsetRecog_mbcs : public CharsetRecognizer {
     47 
     48 protected:
     49     /**
     50      * Test the match of this charset with the input text data
     51      *      which is obtained via the CharsetDetector object.
     52      *
     53      * @param det  The CharsetDetector, which contains the input text
     54      *             to be checked for being in this charset.
     55      * @return     Two values packed into one int  (Damn java, anyhow)
     56      *             <br/>
     57      *             bits 0-7:  the match confidence, ranging from 0-100
     58      *             <br/>
     59      *             bits 8-15: The match reason, an enum-like value.
     60      */
     61     int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen);
     62 
     63 public:
     64 
     65     virtual ~CharsetRecog_mbcs();
     66 
     67     /**
     68      * Get the IANA name of this charset.
     69      * @return the charset name.
     70      */
     71 
     72     const char *getName() const = 0;
     73     const char *getLanguage() const = 0;
     74     int32_t match(InputText* det) = 0;
     75 
     76     /**
     77      * Get the next character (however many bytes it is) from the input data
     78      *    Subclasses for specific charset encodings must implement this function
     79      *    to get characters according to the rules of their encoding scheme.
     80      *
     81      *  This function is not a method of class IteratedChar only because
     82      *   that would require a lot of extra derived classes, which is awkward.
     83      * @param it  The IteratedChar "struct" into which the returned char is placed.
     84      * @param det The charset detector, which is needed to get at the input byte data
     85      *            being iterated over.
     86      * @return    True if a character was returned, false at end of input.
     87      */
     88     virtual UBool nextChar(IteratedChar *it, InputText *textIn) = 0;
     89 
     90 };
     91 
     92 
     93 /**
     94  *   Shift-JIS charset recognizer.
     95  *
     96  */
     97 class CharsetRecog_sjis : public CharsetRecog_mbcs {
     98 public:
     99     virtual ~CharsetRecog_sjis();
    100 
    101     UBool nextChar(IteratedChar *it, InputText *det);
    102 
    103     int32_t match(InputText *det);
    104 
    105     const char *getName() const;
    106     const char *getLanguage() const;
    107 
    108 };
    109 
    110 
    111 /**
    112  *   EUC charset recognizers.  One abstract class that provides the common function
    113  *             for getting the next character according to the EUC encoding scheme,
    114  *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
    115  *
    116  */
    117 class CharsetRecog_euc : public CharsetRecog_mbcs
    118 {
    119 public:
    120     virtual ~CharsetRecog_euc();
    121 
    122     const char *getName() const = 0;
    123     const char *getLanguage() const = 0;
    124 
    125     int32_t match(InputText* det) = 0;
    126     /*
    127      *  (non-Javadoc)
    128      *  Get the next character value for EUC based encodings.
    129      *  Character "value" is simply the raw bytes that make up the character
    130      *     packed into an int.
    131      */
    132     UBool nextChar(IteratedChar *it, InputText *det);
    133 };
    134 
    135 /**
    136  * The charset recognize for EUC-JP.  A singleton instance of this class
    137  *    is created and kept by the public CharsetDetector class
    138  */
    139 class CharsetRecog_euc_jp : public CharsetRecog_euc
    140 {
    141 public:
    142     virtual ~CharsetRecog_euc_jp();
    143 
    144     const char *getName() const;
    145     const char *getLanguage() const;
    146 
    147     int32_t match(InputText *det);
    148 };
    149 
    150 /**
    151  * The charset recognize for EUC-KR.  A singleton instance of this class
    152  *    is created and kept by the public CharsetDetector class
    153  */
    154 class CharsetRecog_euc_kr : public CharsetRecog_euc
    155 {
    156 public:
    157     virtual ~CharsetRecog_euc_kr();
    158 
    159     const char *getName() const;
    160     const char *getLanguage() const;
    161 
    162     int32_t match(InputText *det);
    163 };
    164 
    165 /**
    166  *
    167  *   Big5 charset recognizer.
    168  *
    169  */
    170 class CharsetRecog_big5 : public CharsetRecog_mbcs
    171 {
    172 public:
    173     virtual ~CharsetRecog_big5();
    174 
    175     UBool nextChar(IteratedChar* it, InputText* det);
    176 
    177     const char *getName() const;
    178     const char *getLanguage() const;
    179 
    180     int32_t match(InputText *det);
    181 };
    182 
    183 
    184 /**
    185  *
    186  *   GB-18030 recognizer. Uses simplified Chinese statistics.
    187  *
    188  */
    189 class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
    190 {
    191 public:
    192     virtual ~CharsetRecog_gb_18030();
    193 
    194     UBool nextChar(IteratedChar* it, InputText* det);
    195 
    196     const char *getName() const;
    197     const char *getLanguage() const;
    198 
    199     int32_t match(InputText *det);
    200 };
    201 
    202 U_NAMESPACE_END
    203 
    204 #endif
    205 #endif /* __CSRMBCS_H */
    206