Home | History | Annotate | Download | only in enc
      1 // Copyright 2010 Google Inc. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 // http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 //
     15 // Transformations on dictionary words.
     16 
     17 #ifndef BROTLI_ENC_TRANSFORM_H_
     18 #define BROTLI_ENC_TRANSFORM_H_
     19 
     20 #include <string>
     21 
     22 #include "./dictionary.h"
     23 
     24 namespace brotli {
     25 
     26 enum WordTransformType {
     27   kIdentity       = 0,
     28   kOmitLast1      = 1,
     29   kOmitLast2      = 2,
     30   kOmitLast3      = 3,
     31   kOmitLast4      = 4,
     32   kOmitLast5      = 5,
     33   kOmitLast6      = 6,
     34   kOmitLast7      = 7,
     35   kOmitLast8      = 8,
     36   kOmitLast9      = 9,
     37   kUppercaseFirst = 10,
     38   kUppercaseAll   = 11,
     39   kOmitFirst1     = 12,
     40   kOmitFirst2     = 13,
     41   kOmitFirst3     = 14,
     42   kOmitFirst4     = 15,
     43   kOmitFirst5     = 16,
     44   kOmitFirst6     = 17,
     45   kOmitFirst7     = 18,
     46   kOmitFirst8     = 19,
     47   kOmitFirst9     = 20,
     48 };
     49 
     50 struct Transform {
     51   const char* prefix;
     52   WordTransformType word_transform;
     53   const char* suffix;
     54 };
     55 
     56 static const Transform kTransforms[] = {
     57      {         "", kIdentity,       ""           },
     58      {         "", kIdentity,       " "          },
     59      {        " ", kIdentity,       " "          },
     60      {         "", kOmitFirst1,     ""           },
     61      {         "", kUppercaseFirst, " "          },
     62      {         "", kIdentity,       " the "      },
     63      {        " ", kIdentity,       ""           },
     64      {       "s ", kIdentity,       " "          },
     65      {         "", kIdentity,       " of "       },
     66      {         "", kUppercaseFirst, ""           },
     67      {         "", kIdentity,       " and "      },
     68      {         "", kOmitFirst2,     ""           },
     69      {         "", kOmitLast1,      ""           },
     70      {       ", ", kIdentity,       " "          },
     71      {         "", kIdentity,       ", "         },
     72      {        " ", kUppercaseFirst, " "          },
     73      {         "", kIdentity,       " in "       },
     74      {         "", kIdentity,       " to "       },
     75      {       "e ", kIdentity,       " "          },
     76      {         "", kIdentity,       "\""         },
     77      {         "", kIdentity,       "."          },
     78      {         "", kIdentity,       "\">"        },
     79      {         "", kIdentity,       "\n"         },
     80      {         "", kOmitLast3,      ""           },
     81      {         "", kIdentity,       "]"          },
     82      {         "", kIdentity,       " for "      },
     83      {         "", kOmitFirst3,     ""           },
     84      {         "", kOmitLast2,      ""           },
     85      {         "", kIdentity,       " a "        },
     86      {         "", kIdentity,       " that "     },
     87      {        " ", kUppercaseFirst, ""           },
     88      {         "", kIdentity,       ". "         },
     89      {        ".", kIdentity,       ""           },
     90      {        " ", kIdentity,       ", "         },
     91      {         "", kOmitFirst4,     ""           },
     92      {         "", kIdentity,       " with "     },
     93      {         "", kIdentity,       "'"          },
     94      {         "", kIdentity,       " from "     },
     95      {         "", kIdentity,       " by "       },
     96      {         "", kOmitFirst5,     ""           },
     97      {         "", kOmitFirst6,     ""           },
     98      {    " the ", kIdentity,       ""           },
     99      {         "", kOmitLast4,      ""           },
    100      {         "", kIdentity,       ". The "     },
    101      {         "", kUppercaseAll,   ""           },
    102      {         "", kIdentity,       " on "       },
    103      {         "", kIdentity,       " as "       },
    104      {         "", kIdentity,       " is "       },
    105      {         "", kOmitLast7,      ""           },
    106      {         "", kOmitLast1,      "ing "       },
    107      {         "", kIdentity,       "\n\t"       },
    108      {         "", kIdentity,       ":"          },
    109      {        " ", kIdentity,       ". "         },
    110      {         "", kIdentity,       "ed "        },
    111      {         "", kOmitFirst9,     ""           },
    112      {         "", kOmitFirst7,     ""           },
    113      {         "", kOmitLast6,      ""           },
    114      {         "", kIdentity,       "("          },
    115      {         "", kUppercaseFirst, ", "         },
    116      {         "", kOmitLast8,      ""           },
    117      {         "", kIdentity,       " at "       },
    118      {         "", kIdentity,       "ly "        },
    119      {    " the ", kIdentity,       " of "       },
    120      {         "", kOmitLast5,      ""           },
    121      {         "", kOmitLast9,      ""           },
    122      {        " ", kUppercaseFirst, ", "         },
    123      {         "", kUppercaseFirst, "\""         },
    124      {        ".", kIdentity,       "("          },
    125      {         "", kUppercaseAll,   " "          },
    126      {         "", kUppercaseFirst, "\">"        },
    127      {         "", kIdentity,       "=\""        },
    128      {        " ", kIdentity,       "."          },
    129      {    ".com/", kIdentity,       ""           },
    130      {    " the ", kIdentity,       " of the "   },
    131      {         "", kUppercaseFirst, "'"          },
    132      {         "", kIdentity,       ". This "    },
    133      {         "", kIdentity,       ","          },
    134      {        ".", kIdentity,       " "          },
    135      {         "", kUppercaseFirst, "("          },
    136      {         "", kUppercaseFirst, "."          },
    137      {         "", kIdentity,       " not "      },
    138      {        " ", kIdentity,       "=\""        },
    139      {         "", kIdentity,       "er "        },
    140      {        " ", kUppercaseAll,   " "          },
    141      {         "", kIdentity,       "al "        },
    142      {        " ", kUppercaseAll,   ""           },
    143      {         "", kIdentity,       "='"         },
    144      {         "", kUppercaseAll,   "\""         },
    145      {         "", kUppercaseFirst, ". "         },
    146      {        " ", kIdentity,       "("          },
    147      {         "", kIdentity,       "ful "       },
    148      {        " ", kUppercaseFirst, ". "         },
    149      {         "", kIdentity,       "ive "       },
    150      {         "", kIdentity,       "less "      },
    151      {         "", kUppercaseAll,   "'"          },
    152      {         "", kIdentity,       "est "       },
    153      {        " ", kUppercaseFirst, "."          },
    154      {         "", kUppercaseAll,   "\">"        },
    155      {        " ", kIdentity,       "='"         },
    156      {         "", kUppercaseFirst, ","          },
    157      {         "", kIdentity,       "ize "       },
    158      {         "", kUppercaseAll,   "."          },
    159      { "\xc2\xa0", kIdentity,       ""           },
    160      {        " ", kIdentity,       ","          },
    161      {         "", kUppercaseFirst, "=\""        },
    162      {         "", kUppercaseAll,   "=\""        },
    163      {         "", kIdentity,       "ous "       },
    164      {         "", kUppercaseAll,   ", "         },
    165      {         "", kUppercaseFirst, "='"         },
    166      {        " ", kUppercaseFirst, ","          },
    167      {        " ", kUppercaseAll,   "=\""        },
    168      {        " ", kUppercaseAll,   ", "         },
    169      {         "", kUppercaseAll,   ","          },
    170      {         "", kUppercaseAll,   "("          },
    171      {         "", kUppercaseAll,   ". "         },
    172      {        " ", kUppercaseAll,   "."          },
    173      {         "", kUppercaseAll,   "='"         },
    174      {        " ", kUppercaseAll,   ". "         },
    175      {        " ", kUppercaseFirst, "=\""        },
    176      {        " ", kUppercaseAll,   "='"         },
    177      {        " ", kUppercaseFirst, "='"         },
    178 };
    179 
    180 static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
    181 
    182 static int ToUpperCase(uint8_t *p, int len) {
    183   if (len == 1 || p[0] < 0xc0) {
    184     if (p[0] >= 'a' && p[0] <= 'z') {
    185       p[0] ^= 32;
    186     }
    187     return 1;
    188   }
    189   if (p[0] < 0xe0) {
    190     p[1] ^= 32;
    191     return 2;
    192   }
    193   if (len == 2) {
    194     return 2;
    195   }
    196   p[2] ^= 5;
    197   return 3;
    198 }
    199 
    200 inline std::string ApplyTransform(
    201     const Transform& t, const uint8_t* word, int len) {
    202   std::string ret(t.prefix);
    203   if (t.word_transform <= kOmitLast9) {
    204     len -= t.word_transform;
    205   }
    206   if (len > 0) {
    207     if (t.word_transform >= kOmitFirst1) {
    208       const int skip = t.word_transform - (kOmitFirst1 - 1);
    209       if (len > skip) {
    210         ret += std::string(word + skip, word + len);
    211       }
    212     } else {
    213       ret += std::string(word, word + len);
    214       uint8_t *uppercase = reinterpret_cast<uint8_t*>(&ret[ret.size() - len]);
    215       if (t.word_transform == kUppercaseFirst) {
    216         ToUpperCase(uppercase, len);
    217       } else if (t.word_transform == kUppercaseAll) {
    218         while (len > 0) {
    219           int step = ToUpperCase(uppercase, len);
    220           uppercase += step;
    221           len -= step;
    222         }
    223       }
    224     }
    225   }
    226   ret += std::string(t.suffix);
    227   return ret;
    228 }
    229 
    230 inline std::string GetTransformedDictionaryWord(int len_code, int word_id) {
    231   int num_words = 1 << kBrotliDictionarySizeBitsByLength[len_code];
    232   int offset = kBrotliDictionaryOffsetsByLength[len_code];
    233   int t = word_id / num_words;
    234   int word_idx = word_id % num_words;
    235   offset += len_code * word_idx;
    236   const uint8_t* word = &kBrotliDictionary[offset];
    237   return ApplyTransform(kTransforms[t], word, len_code);
    238 }
    239 
    240 }  // namespace brotli
    241 
    242 #endif  // BROTLI_ENC_TRANSFORM_H_
    243