1 #include "vterm_internal.h" 2 3 #define UNICODE_INVALID 0xFFFD 4 5 #if defined(DEBUG) && DEBUG > 1 6 # define DEBUG_PRINT_UTF8 7 #endif 8 9 struct UTF8DecoderData { 10 // number of bytes remaining in this codepoint 11 int bytes_remaining; 12 13 // number of bytes total in this codepoint once it's finished 14 // (for detecting overlongs) 15 int bytes_total; 16 17 int this_cp; 18 }; 19 20 static void init_utf8(VTermEncoding *enc, void *data_) 21 { 22 struct UTF8DecoderData *data = data_; 23 24 data->bytes_remaining = 0; 25 data->bytes_total = 0; 26 } 27 28 static void decode_utf8(VTermEncoding *enc, void *data_, 29 uint32_t cp[], int *cpi, int cplen, 30 const char bytes[], size_t *pos, size_t bytelen) 31 { 32 struct UTF8DecoderData *data = data_; 33 34 #ifdef DEBUG_PRINT_UTF8 35 printf("BEGIN UTF-8\n"); 36 #endif 37 38 for(; *pos < bytelen && *cpi < cplen; (*pos)++) { 39 unsigned char c = bytes[*pos]; 40 41 #ifdef DEBUG_PRINT_UTF8 42 printf(" pos=%zd c=%02x rem=%d\n", *pos, c, data->bytes_remaining); 43 #endif 44 45 if(c < 0x20) 46 return; 47 48 else if(c >= 0x20 && c < 0x80) { 49 if(data->bytes_remaining) 50 cp[(*cpi)++] = UNICODE_INVALID; 51 52 cp[(*cpi)++] = c; 53 #ifdef DEBUG_PRINT_UTF8 54 printf(" UTF-8 char: U+%04x\n", c); 55 #endif 56 data->bytes_remaining = 0; 57 } 58 59 else if(c >= 0x80 && c < 0xc0) { 60 if(!data->bytes_remaining) { 61 cp[(*cpi)++] = UNICODE_INVALID; 62 continue; 63 } 64 65 data->this_cp <<= 6; 66 data->this_cp |= c & 0x3f; 67 data->bytes_remaining--; 68 69 if(!data->bytes_remaining) { 70 #ifdef DEBUG_PRINT_UTF8 71 printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total); 72 #endif 73 // Check for overlong sequences 74 switch(data->bytes_total) { 75 case 2: 76 if(data->this_cp < 0x0080) data->this_cp = UNICODE_INVALID; break; 77 case 3: 78 if(data->this_cp < 0x0800) data->this_cp = UNICODE_INVALID; break; 79 case 4: 80 if(data->this_cp < 0x10000) data->this_cp = UNICODE_INVALID; break; 81 case 5: 82 if(data->this_cp < 0x200000) data->this_cp = UNICODE_INVALID; break; 83 case 6: 84 if(data->this_cp < 0x4000000) data->this_cp = UNICODE_INVALID; break; 85 } 86 // Now look for plain invalid ones 87 if((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) || 88 data->this_cp == 0xFFFE || 89 data->this_cp == 0xFFFF) 90 data->this_cp = UNICODE_INVALID; 91 #ifdef DEBUG_PRINT_UTF8 92 printf(" char: U+%04x\n", data->this_cp); 93 #endif 94 cp[(*cpi)++] = data->this_cp; 95 } 96 } 97 98 else if(c >= 0xc0 && c < 0xe0) { 99 if(data->bytes_remaining) 100 cp[(*cpi)++] = UNICODE_INVALID; 101 102 data->this_cp = c & 0x1f; 103 data->bytes_total = 2; 104 data->bytes_remaining = 1; 105 } 106 107 else if(c >= 0xe0 && c < 0xf0) { 108 if(data->bytes_remaining) 109 cp[(*cpi)++] = UNICODE_INVALID; 110 111 data->this_cp = c & 0x0f; 112 data->bytes_total = 3; 113 data->bytes_remaining = 2; 114 } 115 116 else if(c >= 0xf0 && c < 0xf8) { 117 if(data->bytes_remaining) 118 cp[(*cpi)++] = UNICODE_INVALID; 119 120 data->this_cp = c & 0x07; 121 data->bytes_total = 4; 122 data->bytes_remaining = 3; 123 } 124 125 else if(c >= 0xf8 && c < 0xfc) { 126 if(data->bytes_remaining) 127 cp[(*cpi)++] = UNICODE_INVALID; 128 129 data->this_cp = c & 0x03; 130 data->bytes_total = 5; 131 data->bytes_remaining = 4; 132 } 133 134 else if(c >= 0xfc && c < 0xfe) { 135 if(data->bytes_remaining) 136 cp[(*cpi)++] = UNICODE_INVALID; 137 138 data->this_cp = c & 0x01; 139 data->bytes_total = 6; 140 data->bytes_remaining = 5; 141 } 142 143 else { 144 cp[(*cpi)++] = UNICODE_INVALID; 145 } 146 } 147 } 148 149 static VTermEncoding encoding_utf8 = { 150 .init = &init_utf8, 151 .decode = &decode_utf8, 152 }; 153 154 static void decode_usascii(VTermEncoding *enc, void *data, 155 uint32_t cp[], int *cpi, int cplen, 156 const char bytes[], size_t *pos, size_t bytelen) 157 { 158 int is_gr = bytes[*pos] & 0x80; 159 160 for(; *pos < bytelen && *cpi < cplen; (*pos)++) { 161 unsigned char c = bytes[*pos] ^ is_gr; 162 163 if(c < 0x20 || c >= 0x80) 164 return; 165 166 cp[(*cpi)++] = c; 167 } 168 } 169 170 static VTermEncoding encoding_usascii = { 171 .decode = &decode_usascii, 172 }; 173 174 struct StaticTableEncoding { 175 const VTermEncoding enc; 176 const uint32_t chars[128]; 177 }; 178 179 static void decode_table(VTermEncoding *enc, void *data, 180 uint32_t cp[], int *cpi, int cplen, 181 const char bytes[], size_t *pos, size_t bytelen) 182 { 183 struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc; 184 int is_gr = bytes[*pos] & 0x80; 185 186 for(; *pos < bytelen && *cpi < cplen; (*pos)++) { 187 unsigned char c = bytes[*pos] ^ is_gr; 188 189 if(c < 0x20 || c >= 0x80) 190 return; 191 192 if(table->chars[c]) 193 cp[(*cpi)++] = table->chars[c]; 194 else 195 cp[(*cpi)++] = c; 196 } 197 } 198 199 #include "encoding/DECdrawing.inc" 200 #include "encoding/uk.inc" 201 202 static struct { 203 VTermEncodingType type; 204 char designation; 205 VTermEncoding *enc; 206 } 207 encodings[] = { 208 { ENC_UTF8, 'u', &encoding_utf8 }, 209 { ENC_SINGLE_94, '0', (VTermEncoding*)&encoding_DECdrawing }, 210 { ENC_SINGLE_94, 'A', (VTermEncoding*)&encoding_uk }, 211 { ENC_SINGLE_94, 'B', &encoding_usascii }, 212 { 0 }, 213 }; 214 215 /* This ought to be INTERNAL but isn't because it's used by unit testing */ 216 VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation) 217 { 218 for(int i = 0; encodings[i].designation; i++) 219 if(encodings[i].type == type && encodings[i].designation == designation) 220 return encodings[i].enc; 221 return NULL; 222 } 223