Home | History | Annotate | Download | only in enc
      1 /* Copyright 2013 Google Inc. All Rights Reserved.
      2 
      3    Distributed under MIT license.
      4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      5 */
      6 
      7 /* Heuristics for deciding about the UTF8-ness of strings. */
      8 
      9 #include "./utf8_util.h"
     10 
     11 #include <brotli/types.h>
     12 
     13 #if defined(__cplusplus) || defined(c_plusplus)
     14 extern "C" {
     15 #endif
     16 
     17 static size_t BrotliParseAsUTF8(
     18     int* symbol, const uint8_t* input, size_t size) {
     19   /* ASCII */
     20   if ((input[0] & 0x80) == 0) {
     21     *symbol = input[0];
     22     if (*symbol > 0) {
     23       return 1;
     24     }
     25   }
     26   /* 2-byte UTF8 */
     27   if (size > 1u &&
     28       (input[0] & 0xe0) == 0xc0 &&
     29       (input[1] & 0xc0) == 0x80) {
     30     *symbol = (((input[0] & 0x1f) << 6) |
     31                (input[1] & 0x3f));
     32     if (*symbol > 0x7f) {
     33       return 2;
     34     }
     35   }
     36   /* 3-byte UFT8 */
     37   if (size > 2u &&
     38       (input[0] & 0xf0) == 0xe0 &&
     39       (input[1] & 0xc0) == 0x80 &&
     40       (input[2] & 0xc0) == 0x80) {
     41     *symbol = (((input[0] & 0x0f) << 12) |
     42                ((input[1] & 0x3f) << 6) |
     43                (input[2] & 0x3f));
     44     if (*symbol > 0x7ff) {
     45       return 3;
     46     }
     47   }
     48   /* 4-byte UFT8 */
     49   if (size > 3u &&
     50       (input[0] & 0xf8) == 0xf0 &&
     51       (input[1] & 0xc0) == 0x80 &&
     52       (input[2] & 0xc0) == 0x80 &&
     53       (input[3] & 0xc0) == 0x80) {
     54     *symbol = (((input[0] & 0x07) << 18) |
     55                ((input[1] & 0x3f) << 12) |
     56                ((input[2] & 0x3f) << 6) |
     57                (input[3] & 0x3f));
     58     if (*symbol > 0xffff && *symbol <= 0x10ffff) {
     59       return 4;
     60     }
     61   }
     62   /* Not UTF8, emit a special symbol above the UTF8-code space */
     63   *symbol = 0x110000 | input[0];
     64   return 1;
     65 }
     66 
     67 /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
     68 BROTLI_BOOL BrotliIsMostlyUTF8(
     69     const uint8_t* data, const size_t pos, const size_t mask,
     70     const size_t length, const double min_fraction) {
     71   size_t size_utf8 = 0;
     72   size_t i = 0;
     73   while (i < length) {
     74     int symbol;
     75     size_t bytes_read =
     76         BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
     77     i += bytes_read;
     78     if (symbol < 0x110000) size_utf8 += bytes_read;
     79   }
     80   return TO_BROTLI_BOOL(size_utf8 > min_fraction * (double)length);
     81 }
     82 
     83 #if defined(__cplusplus) || defined(c_plusplus)
     84 }  /* extern "C" */
     85 #endif
     86