Home | History | Annotate | Download | only in src
      1 #include "vterm_internal.h"
      2 
/* Code point emitted in place of malformed input (U+FFFD, the Unicode
 * replacement character) */
#define UNICODE_INVALID 0xFFFD

/* Per-byte decoder tracing is compiled in only when DEBUG is defined and
 * greater than 1 */
#if defined(DEBUG) && DEBUG > 1
# define DEBUG_PRINT_UTF8
#endif
      8 
      9 struct UTF8DecoderData {
     10   // number of bytes remaining in this codepoint
     11   int bytes_remaining;
     12 
     13   // number of bytes total in this codepoint once it's finished
     14   // (for detecting overlongs)
     15   int bytes_total;
     16 
     17   int this_cp;
     18 };
     19 
     20 static void init_utf8(VTermEncoding *enc, void *data_)
     21 {
     22   struct UTF8DecoderData *data = data_;
     23 
     24   data->bytes_remaining = 0;
     25   data->bytes_total     = 0;
     26 }
     27 
     28 static void decode_utf8(VTermEncoding *enc, void *data_,
     29                         uint32_t cp[], int *cpi, int cplen,
     30                         const char bytes[], size_t *pos, size_t bytelen)
     31 {
     32   struct UTF8DecoderData *data = data_;
     33 
     34 #ifdef DEBUG_PRINT_UTF8
     35   printf("BEGIN UTF-8\n");
     36 #endif
     37 
     38   for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
     39     unsigned char c = bytes[*pos];
     40 
     41 #ifdef DEBUG_PRINT_UTF8
     42     printf(" pos=%zd c=%02x rem=%d\n", *pos, c, data->bytes_remaining);
     43 #endif
     44 
     45     if(c < 0x20)
     46       return;
     47 
     48     else if(c >= 0x20 && c < 0x80) {
     49       if(data->bytes_remaining)
     50         cp[(*cpi)++] = UNICODE_INVALID;
     51 
     52       cp[(*cpi)++] = c;
     53 #ifdef DEBUG_PRINT_UTF8
     54       printf(" UTF-8 char: U+%04x\n", c);
     55 #endif
     56       data->bytes_remaining = 0;
     57     }
     58 
     59     else if(c >= 0x80 && c < 0xc0) {
     60       if(!data->bytes_remaining) {
     61         cp[(*cpi)++] = UNICODE_INVALID;
     62         continue;
     63       }
     64 
     65       data->this_cp <<= 6;
     66       data->this_cp |= c & 0x3f;
     67       data->bytes_remaining--;
     68 
     69       if(!data->bytes_remaining) {
     70 #ifdef DEBUG_PRINT_UTF8
     71         printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total);
     72 #endif
     73         // Check for overlong sequences
     74         switch(data->bytes_total) {
     75         case 2:
     76           if(data->this_cp <  0x0080) data->this_cp = UNICODE_INVALID; break;
     77         case 3:
     78           if(data->this_cp <  0x0800) data->this_cp = UNICODE_INVALID; break;
     79         case 4:
     80           if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID; break;
     81         case 5:
     82           if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID; break;
     83         case 6:
     84           if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID; break;
     85         }
     86         // Now look for plain invalid ones
     87         if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) ||
     88            data->this_cp == 0xFFFE ||
     89            data->this_cp == 0xFFFF)
     90           data->this_cp = UNICODE_INVALID;
     91 #ifdef DEBUG_PRINT_UTF8
     92         printf(" char: U+%04x\n", data->this_cp);
     93 #endif
     94         cp[(*cpi)++] = data->this_cp;
     95       }
     96     }
     97 
     98     else if(c >= 0xc0 && c < 0xe0) {
     99       if(data->bytes_remaining)
    100         cp[(*cpi)++] = UNICODE_INVALID;
    101 
    102       data->this_cp = c & 0x1f;
    103       data->bytes_total = 2;
    104       data->bytes_remaining = 1;
    105     }
    106 
    107     else if(c >= 0xe0 && c < 0xf0) {
    108       if(data->bytes_remaining)
    109         cp[(*cpi)++] = UNICODE_INVALID;
    110 
    111       data->this_cp = c & 0x0f;
    112       data->bytes_total = 3;
    113       data->bytes_remaining = 2;
    114     }
    115 
    116     else if(c >= 0xf0 && c < 0xf8) {
    117       if(data->bytes_remaining)
    118         cp[(*cpi)++] = UNICODE_INVALID;
    119 
    120       data->this_cp = c & 0x07;
    121       data->bytes_total = 4;
    122       data->bytes_remaining = 3;
    123     }
    124 
    125     else if(c >= 0xf8 && c < 0xfc) {
    126       if(data->bytes_remaining)
    127         cp[(*cpi)++] = UNICODE_INVALID;
    128 
    129       data->this_cp = c & 0x03;
    130       data->bytes_total = 5;
    131       data->bytes_remaining = 4;
    132     }
    133 
    134     else if(c >= 0xfc && c < 0xfe) {
    135       if(data->bytes_remaining)
    136         cp[(*cpi)++] = UNICODE_INVALID;
    137 
    138       data->this_cp = c & 0x01;
    139       data->bytes_total = 6;
    140       data->bytes_remaining = 5;
    141     }
    142 
    143     else {
    144       cp[(*cpi)++] = UNICODE_INVALID;
    145     }
    146   }
    147 }
    148 
    149 static VTermEncoding encoding_utf8 = {
    150   .init   = &init_utf8,
    151   .decode = &decode_utf8,
    152 };
    153 
    154 static void decode_usascii(VTermEncoding *enc, void *data,
    155                            uint32_t cp[], int *cpi, int cplen,
    156                            const char bytes[], size_t *pos, size_t bytelen)
    157 {
    158   int is_gr = bytes[*pos] & 0x80;
    159 
    160   for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    161     unsigned char c = bytes[*pos] ^ is_gr;
    162 
    163     if(c < 0x20 || c >= 0x80)
    164       return;
    165 
    166     cp[(*cpi)++] = c;
    167   }
    168 }
    169 
    170 static VTermEncoding encoding_usascii = {
    171   .decode = &decode_usascii,
    172 };
    173 
    174 struct StaticTableEncoding {
    175   const VTermEncoding enc;
    176   const uint32_t chars[128];
    177 };
    178 
    179 static void decode_table(VTermEncoding *enc, void *data,
    180                          uint32_t cp[], int *cpi, int cplen,
    181                          const char bytes[], size_t *pos, size_t bytelen)
    182 {
    183   struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc;
    184   int is_gr = bytes[*pos] & 0x80;
    185 
    186   for(; *pos < bytelen && *cpi < cplen; (*pos)++) {
    187     unsigned char c = bytes[*pos] ^ is_gr;
    188 
    189     if(c < 0x20 || c >= 0x80)
    190       return;
    191 
    192     if(table->chars[c])
    193       cp[(*cpi)++] = table->chars[c];
    194     else
    195       cp[(*cpi)++] = c;
    196   }
    197 }
    198 
    199 #include "encoding/DECdrawing.inc"
    200 #include "encoding/uk.inc"
    201 
    202 static struct {
    203   VTermEncodingType type;
    204   char designation;
    205   VTermEncoding *enc;
    206 }
    207 encodings[] = {
    208   { ENC_UTF8,      'u', &encoding_utf8 },
    209   { ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing },
    210   { ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk },
    211   { ENC_SINGLE_94, 'B', &encoding_usascii },
    212   { 0 },
    213 };
    214 
    215 /* This ought to be INTERNAL but isn't because it's used by unit testing */
    216 VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation)
    217 {
    218   for(int i = 0; encodings[i].designation; i++)
    219     if(encodings[i].type == type && encodings[i].designation == designation)
    220       return encodings[i].enc;
    221   return NULL;
    222 }
    223