1 /* Copyright 2013 Google Inc. All Rights Reserved. 2 3 Distributed under MIT license. 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 5 */ 6 7 /* Heuristics for deciding about the UTF8-ness of strings. */ 8 9 #include "./utf8_util.h" 10 11 #include <brotli/types.h> 12 13 #if defined(__cplusplus) || defined(c_plusplus) 14 extern "C" { 15 #endif 16 17 static size_t BrotliParseAsUTF8( 18 int* symbol, const uint8_t* input, size_t size) { 19 /* ASCII */ 20 if ((input[0] & 0x80) == 0) { 21 *symbol = input[0]; 22 if (*symbol > 0) { 23 return 1; 24 } 25 } 26 /* 2-byte UTF8 */ 27 if (size > 1u && 28 (input[0] & 0xe0) == 0xc0 && 29 (input[1] & 0xc0) == 0x80) { 30 *symbol = (((input[0] & 0x1f) << 6) | 31 (input[1] & 0x3f)); 32 if (*symbol > 0x7f) { 33 return 2; 34 } 35 } 36 /* 3-byte UFT8 */ 37 if (size > 2u && 38 (input[0] & 0xf0) == 0xe0 && 39 (input[1] & 0xc0) == 0x80 && 40 (input[2] & 0xc0) == 0x80) { 41 *symbol = (((input[0] & 0x0f) << 12) | 42 ((input[1] & 0x3f) << 6) | 43 (input[2] & 0x3f)); 44 if (*symbol > 0x7ff) { 45 return 3; 46 } 47 } 48 /* 4-byte UFT8 */ 49 if (size > 3u && 50 (input[0] & 0xf8) == 0xf0 && 51 (input[1] & 0xc0) == 0x80 && 52 (input[2] & 0xc0) == 0x80 && 53 (input[3] & 0xc0) == 0x80) { 54 *symbol = (((input[0] & 0x07) << 18) | 55 ((input[1] & 0x3f) << 12) | 56 ((input[2] & 0x3f) << 6) | 57 (input[3] & 0x3f)); 58 if (*symbol > 0xffff && *symbol <= 0x10ffff) { 59 return 4; 60 } 61 } 62 /* Not UTF8, emit a special symbol above the UTF8-code space */ 63 *symbol = 0x110000 | input[0]; 64 return 1; 65 } 66 67 /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/ 68 BROTLI_BOOL BrotliIsMostlyUTF8( 69 const uint8_t* data, const size_t pos, const size_t mask, 70 const size_t length, const double min_fraction) { 71 size_t size_utf8 = 0; 72 size_t i = 0; 73 while (i < length) { 74 int symbol; 75 size_t bytes_read = 76 BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i); 77 i += bytes_read; 78 if (symbol < 0x110000) size_utf8 += bytes_read; 79 } 80 return TO_BROTLI_BOOL(size_utf8 > min_fraction * (double)length); 81 } 82 83 #if defined(__cplusplus) || defined(c_plusplus) 84 } /* extern "C" */ 85 #endif 86