Home | History | Annotate | Download | only in win
      1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
      6 #define ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
      7 
      8 #if !defined(CLD_WINDOWS)
      9 
     10 #include "util/utf8/utf8statetable.h"
     11 
     12 #else
     13 
     14 #include "encodings/compact_lang_det/win/cld_basictypes.h"
     15 
     16 // These four-byte entries compactly encode how many bytes 0..255 to delete
     17 // in making a string replacement, how many bytes to add 0..255, and the offset
     18 // 0..64k-1 of the replacement string in remap_string.
     19 struct RemapEntry {
     20   uint8 delete_bytes;
     21   uint8 add_bytes;
     22   uint16 bytes_offset;
     23 };
     24 
     25 // Exit type codes for state tables. All but the first get stuffed into
     26 // signed one-byte entries. The first is only generated by executable code.
     27 // To distinguish from next-state entries, these must be contiguous and
     28 // all <= kExitNone
     29 typedef enum {
     30   kExitDstSpaceFull = 239,
     31   kExitIllegalStructure,  // 240
     32   kExitOK,                // 241
     33   kExitReject,            // ...
     34   kExitReplace1,
     35   kExitReplace2,
     36   kExitReplace3,
     37   kExitReplace21,
     38   kExitReplace31,
     39   kExitReplace32,
     40   kExitReplaceOffset1,
     41   kExitReplaceOffset2,
     42   kExitReplace1S0,
     43   kExitSpecial,
     44   kExitDoAgain,
     45   kExitRejectAlt,
     46   kExitNone               // 255
     47 } ExitReason;
     48 
     49 typedef enum {
     50   kExitDstSpaceFull_2 = -32769,
     51   kExitIllegalStructure_2,  // -32768
     52   kExitOK_2,                // -32767
     53   kExitReject_2,            // ...
     54   kExitReplace1_2,
     55   kExitReplace2_2,
     56   kExitReplace3_2,
     57   kExitReplace21_2,
     58   kExitReplace31_2,
     59   kExitReplace32_2,
     60   kExitReplaceOffset1_2,
     61   kExitReplaceOffset2_2,
     62   kExitReplace1S0_2,
     63   kExitSpecial_2,
     64   kExitDoAgain_2,
     65   kExitRejectAlt_2,
     66   kExitNone_2               // -32753
     67 } ExitReason_2;
     68 
     69 // This struct represents one entire state table. The three initialized byte
     70 // areas are state_table, remap_base, and remap_string. state0 and state0_size
     71 // give the byte offset and length within state_table of the initial state --
     72 // table lookups are expected to start and end in this state, but for
     73 // truncated UTF-8 strings, may end in a different state. These allow a quick
     74 // test for that condition. entry_shift is 8 for tables subscripted by a full
     75 // byte value and 6 for space-optimized tables subscripted by only six
     76 // significant bits in UTF-8 continuation bytes.
     77 typedef struct {
     78   const uint32 state0;
     79   const uint32 state0_size;
     80   const uint32 total_size;
     81   const int max_expand;
     82   const int entry_shift;
     83   const int bytes_per_entry;
     84   const uint32 losub;
     85   const uint32 hiadd;
     86   const uint8* state_table;
     87   const RemapEntry* remap_base;
     88   const uint8* remap_string;
     89   const uint8* fast_state;
     90 } UTF8StateMachineObj;
     91 
     92 // Near-duplicate declaration for tables with two-byte entries
     93 typedef struct {
     94   const uint32 state0;
     95   const uint32 state0_size;
     96   const uint32 total_size;
     97   const int max_expand;
     98   const int entry_shift;
     99   const int bytes_per_entry;
    100   const uint32 losub;
    101   const uint32 hiadd;
    102   const signed short* state_table;
    103   const RemapEntry* remap_base;
    104   const uint8* remap_string;
    105   const uint8* fast_state;
    106 } UTF8StateMachineObj_2;
    107 
    108 
    109 typedef UTF8StateMachineObj UTF8PropObj;
    110 typedef UTF8StateMachineObj UTF8ScanObj;
    111 typedef UTF8StateMachineObj_2 UTF8PropObj_2;
    112 
    113 
    114 // Look up property of one UTF-8 character and advance over it
    115 // Return 0 if input length is zero
    116 // Return 0 and advance one byte if input is ill-formed
    117 uint8 UTF8GenericProperty(const UTF8PropObj* st,
    118                           const uint8** src,
    119                           int* srclen);
    120 
    121 // BigOneByte versions are needed for tables > 240 states, but most
    122 // won't need the TwoByte versions.
    123 
    124 // Look up property of one UTF-8 character and advance over it
    125 // Return 0 if input length is zero
    126 // Return 0 and advance one byte if input is ill-formed
    127 uint8 UTF8GenericPropertyBigOneByte(const UTF8PropObj* st,
    128                           const uint8** src,
    129                           int* srclen);
    130 
    131 // Scan a UTF-8 stringpiece based on a state table.
    132 // Always scan complete UTF-8 characters
    133 // Set number of bytes scanned. Return reason for exiting
    134 int UTF8GenericScan(const UTF8ScanObj* st,
    135                     const uint8* str,
    136                     const int len,
    137                     int* bytes_consumed);
    138 
    139 #endif
    140 
    141 #endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UTF8STATETABLE_H_
    142