Home | History | Annotate | Download | only in dec
      1 /* Copyright 2013 Google Inc. All Rights Reserved.
      2 
      3    Distributed under MIT license.
      4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
      5 */
      6 
      7 /* Transformations on dictionary words. */
      8 
      9 #ifndef BROTLI_DEC_TRANSFORM_H_
     10 #define BROTLI_DEC_TRANSFORM_H_
     11 
     12 #include <brotli/types.h>
     13 #include "./port.h"
     14 
     15 #if defined(__cplusplus) || defined(c_plusplus)
     16 extern "C" {
     17 #endif
     18 
     19 enum WordTransformType {
     20   kIdentity = 0,
     21   kOmitLast1 = 1,
     22   kOmitLast2 = 2,
     23   kOmitLast3 = 3,
     24   kOmitLast4 = 4,
     25   kOmitLast5 = 5,
     26   kOmitLast6 = 6,
     27   kOmitLast7 = 7,
     28   kOmitLast8 = 8,
     29   kOmitLast9 = 9,
     30   kUppercaseFirst = 10,
     31   kUppercaseAll = 11,
     32   kOmitFirst1 = 12,
     33   kOmitFirst2 = 13,
     34   kOmitFirst3 = 14,
     35   kOmitFirst4 = 15,
     36   kOmitFirst5 = 16,
     37   kOmitFirst6 = 17,
     38   kOmitFirst7 = 18,
     39   kOmitFirst8 = 19,
     40   kOmitFirst9 = 20
     41 };
     42 
     43 typedef struct {
     44   const uint8_t prefix_id;
     45   const uint8_t transform;
     46   const uint8_t suffix_id;
     47 } Transform;
     48 
     49 static const char kPrefixSuffix[208] =
     50     "\0 \0, \0 of the \0 of \0s \0.\0 and \0 in \0\"\0 to \0\">\0\n\0. \0]\0"
     51     " for \0 a \0 that \0\'\0 with \0 from \0 by \0(\0. The \0 on \0 as \0"
     52     " is \0ing \0\n\t\0:\0ed \0=\"\0 at \0ly \0,\0=\'\0.com/\0. This \0"
     53     " not \0er \0al \0ful \0ive \0less \0est \0ize \0\xc2\xa0\0ous ";
     54 
     55 enum {
     56   /* EMPTY = ""
     57      SP = " "
     58      DQUOT = "\""
     59      SQUOT = "'"
     60      CLOSEBR = "]"
     61      OPEN = "("
     62      SLASH = "/"
     63      NBSP = non-breaking space "\0xc2\xa0"
     64   */
     65   kPFix_EMPTY = 0,
     66   kPFix_SP = 1,
     67   kPFix_COMMASP = 3,
     68   kPFix_SPofSPtheSP = 6,
     69   kPFix_SPtheSP = 9,
     70   kPFix_eSP = 12,
     71   kPFix_SPofSP = 15,
     72   kPFix_sSP = 20,
     73   kPFix_DOT = 23,
     74   kPFix_SPandSP = 25,
     75   kPFix_SPinSP = 31,
     76   kPFix_DQUOT = 36,
     77   kPFix_SPtoSP = 38,
     78   kPFix_DQUOTGT = 43,
     79   kPFix_NEWLINE = 46,
     80   kPFix_DOTSP = 48,
     81   kPFix_CLOSEBR = 51,
     82   kPFix_SPforSP = 53,
     83   kPFix_SPaSP = 59,
     84   kPFix_SPthatSP = 63,
     85   kPFix_SQUOT = 70,
     86   kPFix_SPwithSP = 72,
     87   kPFix_SPfromSP = 79,
     88   kPFix_SPbySP = 86,
     89   kPFix_OPEN = 91,
     90   kPFix_DOTSPTheSP = 93,
     91   kPFix_SPonSP = 100,
     92   kPFix_SPasSP = 105,
     93   kPFix_SPisSP = 110,
     94   kPFix_ingSP = 115,
     95   kPFix_NEWLINETAB = 120,
     96   kPFix_COLON = 123,
     97   kPFix_edSP = 125,
     98   kPFix_EQDQUOT = 129,
     99   kPFix_SPatSP = 132,
    100   kPFix_lySP = 137,
    101   kPFix_COMMA = 141,
    102   kPFix_EQSQUOT = 143,
    103   kPFix_DOTcomSLASH = 146,
    104   kPFix_DOTSPThisSP = 152,
    105   kPFix_SPnotSP = 160,
    106   kPFix_erSP = 166,
    107   kPFix_alSP = 170,
    108   kPFix_fulSP = 174,
    109   kPFix_iveSP = 179,
    110   kPFix_lessSP = 184,
    111   kPFix_estSP = 190,
    112   kPFix_izeSP = 195,
    113   kPFix_NBSP = 200,
    114   kPFix_ousSP = 203
    115 };
    116 
    117 static const Transform kTransforms[] = {
    118   { kPFix_EMPTY, kIdentity, kPFix_EMPTY },
    119   { kPFix_EMPTY, kIdentity, kPFix_SP },
    120   { kPFix_SP, kIdentity, kPFix_SP },
    121   { kPFix_EMPTY, kOmitFirst1, kPFix_EMPTY },
    122   { kPFix_EMPTY, kUppercaseFirst, kPFix_SP },
    123   { kPFix_EMPTY, kIdentity, kPFix_SPtheSP },
    124   { kPFix_SP, kIdentity, kPFix_EMPTY },
    125   { kPFix_sSP, kIdentity, kPFix_SP },
    126   { kPFix_EMPTY, kIdentity, kPFix_SPofSP },
    127   { kPFix_EMPTY, kUppercaseFirst, kPFix_EMPTY },
    128   { kPFix_EMPTY, kIdentity, kPFix_SPandSP },
    129   { kPFix_EMPTY, kOmitFirst2, kPFix_EMPTY },
    130   { kPFix_EMPTY, kOmitLast1, kPFix_EMPTY },
    131   { kPFix_COMMASP, kIdentity, kPFix_SP },
    132   { kPFix_EMPTY, kIdentity, kPFix_COMMASP },
    133   { kPFix_SP, kUppercaseFirst, kPFix_SP },
    134   { kPFix_EMPTY, kIdentity, kPFix_SPinSP },
    135   { kPFix_EMPTY, kIdentity, kPFix_SPtoSP },
    136   { kPFix_eSP, kIdentity, kPFix_SP },
    137   { kPFix_EMPTY, kIdentity, kPFix_DQUOT },
    138   { kPFix_EMPTY, kIdentity, kPFix_DOT },
    139   { kPFix_EMPTY, kIdentity, kPFix_DQUOTGT },
    140   { kPFix_EMPTY, kIdentity, kPFix_NEWLINE },
    141   { kPFix_EMPTY, kOmitLast3, kPFix_EMPTY },
    142   { kPFix_EMPTY, kIdentity, kPFix_CLOSEBR },
    143   { kPFix_EMPTY, kIdentity, kPFix_SPforSP },
    144   { kPFix_EMPTY, kOmitFirst3, kPFix_EMPTY },
    145   { kPFix_EMPTY, kOmitLast2, kPFix_EMPTY },
    146   { kPFix_EMPTY, kIdentity, kPFix_SPaSP },
    147   { kPFix_EMPTY, kIdentity, kPFix_SPthatSP },
    148   { kPFix_SP, kUppercaseFirst, kPFix_EMPTY },
    149   { kPFix_EMPTY, kIdentity, kPFix_DOTSP },
    150   { kPFix_DOT, kIdentity, kPFix_EMPTY },
    151   { kPFix_SP, kIdentity, kPFix_COMMASP },
    152   { kPFix_EMPTY, kOmitFirst4, kPFix_EMPTY },
    153   { kPFix_EMPTY, kIdentity, kPFix_SPwithSP },
    154   { kPFix_EMPTY, kIdentity, kPFix_SQUOT },
    155   { kPFix_EMPTY, kIdentity, kPFix_SPfromSP },
    156   { kPFix_EMPTY, kIdentity, kPFix_SPbySP },
    157   { kPFix_EMPTY, kOmitFirst5, kPFix_EMPTY },
    158   { kPFix_EMPTY, kOmitFirst6, kPFix_EMPTY },
    159   { kPFix_SPtheSP, kIdentity, kPFix_EMPTY },
    160   { kPFix_EMPTY, kOmitLast4, kPFix_EMPTY },
    161   { kPFix_EMPTY, kIdentity, kPFix_DOTSPTheSP },
    162   { kPFix_EMPTY, kUppercaseAll, kPFix_EMPTY },
    163   { kPFix_EMPTY, kIdentity, kPFix_SPonSP },
    164   { kPFix_EMPTY, kIdentity, kPFix_SPasSP },
    165   { kPFix_EMPTY, kIdentity, kPFix_SPisSP },
    166   { kPFix_EMPTY, kOmitLast7, kPFix_EMPTY },
    167   { kPFix_EMPTY, kOmitLast1, kPFix_ingSP },
    168   { kPFix_EMPTY, kIdentity, kPFix_NEWLINETAB },
    169   { kPFix_EMPTY, kIdentity, kPFix_COLON },
    170   { kPFix_SP, kIdentity, kPFix_DOTSP },
    171   { kPFix_EMPTY, kIdentity, kPFix_edSP },
    172   { kPFix_EMPTY, kOmitFirst9, kPFix_EMPTY },
    173   { kPFix_EMPTY, kOmitFirst7, kPFix_EMPTY },
    174   { kPFix_EMPTY, kOmitLast6, kPFix_EMPTY },
    175   { kPFix_EMPTY, kIdentity, kPFix_OPEN },
    176   { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMASP },
    177   { kPFix_EMPTY, kOmitLast8, kPFix_EMPTY },
    178   { kPFix_EMPTY, kIdentity, kPFix_SPatSP },
    179   { kPFix_EMPTY, kIdentity, kPFix_lySP },
    180   { kPFix_SPtheSP, kIdentity, kPFix_SPofSP },
    181   { kPFix_EMPTY, kOmitLast5, kPFix_EMPTY },
    182   { kPFix_EMPTY, kOmitLast9, kPFix_EMPTY },
    183   { kPFix_SP, kUppercaseFirst, kPFix_COMMASP },
    184   { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOT },
    185   { kPFix_DOT, kIdentity, kPFix_OPEN },
    186   { kPFix_EMPTY, kUppercaseAll, kPFix_SP },
    187   { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOTGT },
    188   { kPFix_EMPTY, kIdentity, kPFix_EQDQUOT },
    189   { kPFix_SP, kIdentity, kPFix_DOT },
    190   { kPFix_DOTcomSLASH, kIdentity, kPFix_EMPTY },
    191   { kPFix_SPtheSP, kIdentity, kPFix_SPofSPtheSP },
    192   { kPFix_EMPTY, kUppercaseFirst, kPFix_SQUOT },
    193   { kPFix_EMPTY, kIdentity, kPFix_DOTSPThisSP },
    194   { kPFix_EMPTY, kIdentity, kPFix_COMMA },
    195   { kPFix_DOT, kIdentity, kPFix_SP },
    196   { kPFix_EMPTY, kUppercaseFirst, kPFix_OPEN },
    197   { kPFix_EMPTY, kUppercaseFirst, kPFix_DOT },
    198   { kPFix_EMPTY, kIdentity, kPFix_SPnotSP },
    199   { kPFix_SP, kIdentity, kPFix_EQDQUOT },
    200   { kPFix_EMPTY, kIdentity, kPFix_erSP },
    201   { kPFix_SP, kUppercaseAll, kPFix_SP },
    202   { kPFix_EMPTY, kIdentity, kPFix_alSP },
    203   { kPFix_SP, kUppercaseAll, kPFix_EMPTY },
    204   { kPFix_EMPTY, kIdentity, kPFix_EQSQUOT },
    205   { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOT },
    206   { kPFix_EMPTY, kUppercaseFirst, kPFix_DOTSP },
    207   { kPFix_SP, kIdentity, kPFix_OPEN },
    208   { kPFix_EMPTY, kIdentity, kPFix_fulSP },
    209   { kPFix_SP, kUppercaseFirst, kPFix_DOTSP },
    210   { kPFix_EMPTY, kIdentity, kPFix_iveSP },
    211   { kPFix_EMPTY, kIdentity, kPFix_lessSP },
    212   { kPFix_EMPTY, kUppercaseAll, kPFix_SQUOT },
    213   { kPFix_EMPTY, kIdentity, kPFix_estSP },
    214   { kPFix_SP, kUppercaseFirst, kPFix_DOT },
    215   { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOTGT },
    216   { kPFix_SP, kIdentity, kPFix_EQSQUOT },
    217   { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMA },
    218   { kPFix_EMPTY, kIdentity, kPFix_izeSP },
    219   { kPFix_EMPTY, kUppercaseAll, kPFix_DOT },
    220   { kPFix_NBSP, kIdentity, kPFix_EMPTY },
    221   { kPFix_SP, kIdentity, kPFix_COMMA },
    222   { kPFix_EMPTY, kUppercaseFirst, kPFix_EQDQUOT },
    223   { kPFix_EMPTY, kUppercaseAll, kPFix_EQDQUOT },
    224   { kPFix_EMPTY, kIdentity, kPFix_ousSP },
    225   { kPFix_EMPTY, kUppercaseAll, kPFix_COMMASP },
    226   { kPFix_EMPTY, kUppercaseFirst, kPFix_EQSQUOT },
    227   { kPFix_SP, kUppercaseFirst, kPFix_COMMA },
    228   { kPFix_SP, kUppercaseAll, kPFix_EQDQUOT },
    229   { kPFix_SP, kUppercaseAll, kPFix_COMMASP },
    230   { kPFix_EMPTY, kUppercaseAll, kPFix_COMMA },
    231   { kPFix_EMPTY, kUppercaseAll, kPFix_OPEN },
    232   { kPFix_EMPTY, kUppercaseAll, kPFix_DOTSP },
    233   { kPFix_SP, kUppercaseAll, kPFix_DOT },
    234   { kPFix_EMPTY, kUppercaseAll, kPFix_EQSQUOT },
    235   { kPFix_SP, kUppercaseAll, kPFix_DOTSP },
    236   { kPFix_SP, kUppercaseFirst, kPFix_EQDQUOT },
    237   { kPFix_SP, kUppercaseAll, kPFix_EQSQUOT },
    238   { kPFix_SP, kUppercaseFirst, kPFix_EQSQUOT },
    239 };
    240 
    241 static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
    242 
    243 static int ToUpperCase(uint8_t* p) {
    244   if (p[0] < 0xc0) {
    245     if (p[0] >= 'a' && p[0] <= 'z') {
    246       p[0] ^= 32;
    247     }
    248     return 1;
    249   }
    250   /* An overly simplified uppercasing model for UTF-8. */
    251   if (p[0] < 0xe0) {
    252     p[1] ^= 32;
    253     return 2;
    254   }
    255   /* An arbitrary transform for three byte characters. */
    256   p[2] ^= 5;
    257   return 3;
    258 }
    259 
    260 static BROTLI_NOINLINE int TransformDictionaryWord(
    261     uint8_t* dst, const uint8_t* word, int len, int transform) {
    262   int idx = 0;
    263   {
    264     const char* prefix = &kPrefixSuffix[kTransforms[transform].prefix_id];
    265     while (*prefix) { dst[idx++] = (uint8_t)*prefix++; }
    266   }
    267   {
    268     const int t = kTransforms[transform].transform;
    269     int i = 0;
    270     int skip = t - (kOmitFirst1 - 1);
    271     if (skip > 0) {
    272       word += skip;
    273       len -= skip;
    274     } else if (t <= kOmitLast9) {
    275       len -= t;
    276     }
    277     while (i < len) { dst[idx++] = word[i++]; }
    278     if (t == kUppercaseFirst) {
    279       ToUpperCase(&dst[idx - len]);
    280     } else if (t == kUppercaseAll) {
    281       uint8_t* uppercase = &dst[idx - len];
    282       while (len > 0) {
    283         int step = ToUpperCase(uppercase);
    284         uppercase += step;
    285         len -= step;
    286       }
    287     }
    288   }
    289   {
    290     const char* suffix = &kPrefixSuffix[kTransforms[transform].suffix_id];
    291     while (*suffix) { dst[idx++] = (uint8_t)*suffix++; }
    292     return idx;
    293   }
    294 }
    295 
    296 #if defined(__cplusplus) || defined(c_plusplus)
    297 }  /* extern "C" */
    298 #endif
    299 
    300 #endif  /* BROTLI_DEC_TRANSFORM_H_ */
    301