1 // Copyright 2010 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // Transformations on dictionary words. 16 17 #ifndef BROTLI_ENC_TRANSFORM_H_ 18 #define BROTLI_ENC_TRANSFORM_H_ 19 20 #include <string> 21 22 #include "./dictionary.h" 23 24 namespace brotli { 25 26 enum WordTransformType { 27 kIdentity = 0, 28 kOmitLast1 = 1, 29 kOmitLast2 = 2, 30 kOmitLast3 = 3, 31 kOmitLast4 = 4, 32 kOmitLast5 = 5, 33 kOmitLast6 = 6, 34 kOmitLast7 = 7, 35 kOmitLast8 = 8, 36 kOmitLast9 = 9, 37 kUppercaseFirst = 10, 38 kUppercaseAll = 11, 39 kOmitFirst1 = 12, 40 kOmitFirst2 = 13, 41 kOmitFirst3 = 14, 42 kOmitFirst4 = 15, 43 kOmitFirst5 = 16, 44 kOmitFirst6 = 17, 45 kOmitFirst7 = 18, 46 kOmitFirst8 = 19, 47 kOmitFirst9 = 20, 48 }; 49 50 struct Transform { 51 const char* prefix; 52 WordTransformType word_transform; 53 const char* suffix; 54 }; 55 56 static const Transform kTransforms[] = { 57 { "", kIdentity, "" }, 58 { "", kIdentity, " " }, 59 { " ", kIdentity, " " }, 60 { "", kOmitFirst1, "" }, 61 { "", kUppercaseFirst, " " }, 62 { "", kIdentity, " the " }, 63 { " ", kIdentity, "" }, 64 { "s ", kIdentity, " " }, 65 { "", kIdentity, " of " }, 66 { "", kUppercaseFirst, "" }, 67 { "", kIdentity, " and " }, 68 { "", kOmitFirst2, "" }, 69 { "", kOmitLast1, "" }, 70 { ", ", kIdentity, " " }, 71 { "", kIdentity, ", " }, 72 { " ", kUppercaseFirst, " " }, 73 { "", kIdentity, " in " }, 74 { "", kIdentity, " to " }, 75 { "e ", kIdentity, " " }, 76 { "", kIdentity, "\"" }, 77 { "", kIdentity, "." }, 78 { "", kIdentity, "\">" }, 79 { "", kIdentity, "\n" }, 80 { "", kOmitLast3, "" }, 81 { "", kIdentity, "]" }, 82 { "", kIdentity, " for " }, 83 { "", kOmitFirst3, "" }, 84 { "", kOmitLast2, "" }, 85 { "", kIdentity, " a " }, 86 { "", kIdentity, " that " }, 87 { " ", kUppercaseFirst, "" }, 88 { "", kIdentity, ". " }, 89 { ".", kIdentity, "" }, 90 { " ", kIdentity, ", " }, 91 { "", kOmitFirst4, "" }, 92 { "", kIdentity, " with " }, 93 { "", kIdentity, "'" }, 94 { "", kIdentity, " from " }, 95 { "", kIdentity, " by " }, 96 { "", kOmitFirst5, "" }, 97 { "", kOmitFirst6, "" }, 98 { " the ", kIdentity, "" }, 99 { "", kOmitLast4, "" }, 100 { "", kIdentity, ". The " }, 101 { "", kUppercaseAll, "" }, 102 { "", kIdentity, " on " }, 103 { "", kIdentity, " as " }, 104 { "", kIdentity, " is " }, 105 { "", kOmitLast7, "" }, 106 { "", kOmitLast1, "ing " }, 107 { "", kIdentity, "\n\t" }, 108 { "", kIdentity, ":" }, 109 { " ", kIdentity, ". " }, 110 { "", kIdentity, "ed " }, 111 { "", kOmitFirst9, "" }, 112 { "", kOmitFirst7, "" }, 113 { "", kOmitLast6, "" }, 114 { "", kIdentity, "(" }, 115 { "", kUppercaseFirst, ", " }, 116 { "", kOmitLast8, "" }, 117 { "", kIdentity, " at " }, 118 { "", kIdentity, "ly " }, 119 { " the ", kIdentity, " of " }, 120 { "", kOmitLast5, "" }, 121 { "", kOmitLast9, "" }, 122 { " ", kUppercaseFirst, ", " }, 123 { "", kUppercaseFirst, "\"" }, 124 { ".", kIdentity, "(" }, 125 { "", kUppercaseAll, " " }, 126 { "", kUppercaseFirst, "\">" }, 127 { "", kIdentity, "=\"" }, 128 { " ", kIdentity, "." }, 129 { ".com/", kIdentity, "" }, 130 { " the ", kIdentity, " of the " }, 131 { "", kUppercaseFirst, "'" }, 132 { "", kIdentity, ". This " }, 133 { "", kIdentity, "," }, 134 { ".", kIdentity, " " }, 135 { "", kUppercaseFirst, "(" }, 136 { "", kUppercaseFirst, "." }, 137 { "", kIdentity, " not " }, 138 { " ", kIdentity, "=\"" }, 139 { "", kIdentity, "er " }, 140 { " ", kUppercaseAll, " " }, 141 { "", kIdentity, "al " }, 142 { " ", kUppercaseAll, "" }, 143 { "", kIdentity, "='" }, 144 { "", kUppercaseAll, "\"" }, 145 { "", kUppercaseFirst, ". " }, 146 { " ", kIdentity, "(" }, 147 { "", kIdentity, "ful " }, 148 { " ", kUppercaseFirst, ". " }, 149 { "", kIdentity, "ive " }, 150 { "", kIdentity, "less " }, 151 { "", kUppercaseAll, "'" }, 152 { "", kIdentity, "est " }, 153 { " ", kUppercaseFirst, "." }, 154 { "", kUppercaseAll, "\">" }, 155 { " ", kIdentity, "='" }, 156 { "", kUppercaseFirst, "," }, 157 { "", kIdentity, "ize " }, 158 { "", kUppercaseAll, "." }, 159 { "\xc2\xa0", kIdentity, "" }, 160 { " ", kIdentity, "," }, 161 { "", kUppercaseFirst, "=\"" }, 162 { "", kUppercaseAll, "=\"" }, 163 { "", kIdentity, "ous " }, 164 { "", kUppercaseAll, ", " }, 165 { "", kUppercaseFirst, "='" }, 166 { " ", kUppercaseFirst, "," }, 167 { " ", kUppercaseAll, "=\"" }, 168 { " ", kUppercaseAll, ", " }, 169 { "", kUppercaseAll, "," }, 170 { "", kUppercaseAll, "(" }, 171 { "", kUppercaseAll, ". " }, 172 { " ", kUppercaseAll, "." }, 173 { "", kUppercaseAll, "='" }, 174 { " ", kUppercaseAll, ". " }, 175 { " ", kUppercaseFirst, "=\"" }, 176 { " ", kUppercaseAll, "='" }, 177 { " ", kUppercaseFirst, "='" }, 178 }; 179 180 static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]); 181 182 static int ToUpperCase(uint8_t *p, int len) { 183 if (len == 1 || p[0] < 0xc0) { 184 if (p[0] >= 'a' && p[0] <= 'z') { 185 p[0] ^= 32; 186 } 187 return 1; 188 } 189 if (p[0] < 0xe0) { 190 p[1] ^= 32; 191 return 2; 192 } 193 if (len == 2) { 194 return 2; 195 } 196 p[2] ^= 5; 197 return 3; 198 } 199 200 inline std::string ApplyTransform( 201 const Transform& t, const uint8_t* word, int len) { 202 std::string ret(t.prefix); 203 if (t.word_transform <= kOmitLast9) { 204 len -= t.word_transform; 205 } 206 if (len > 0) { 207 if (t.word_transform >= kOmitFirst1) { 208 const int skip = t.word_transform - (kOmitFirst1 - 1); 209 if (len > skip) { 210 ret += std::string(word + skip, word + len); 211 } 212 } else { 213 ret += std::string(word, word + len); 214 uint8_t *uppercase = reinterpret_cast<uint8_t*>(&ret[ret.size() - len]); 215 if (t.word_transform == kUppercaseFirst) { 216 ToUpperCase(uppercase, len); 217 } else if (t.word_transform == kUppercaseAll) { 218 while (len > 0) { 219 int step = ToUpperCase(uppercase, len); 220 uppercase += step; 221 len -= step; 222 } 223 } 224 } 225 } 226 ret += std::string(t.suffix); 227 return ret; 228 } 229 230 inline std::string GetTransformedDictionaryWord(int len_code, int word_id) { 231 int num_words = 1 << kBrotliDictionarySizeBitsByLength[len_code]; 232 int offset = kBrotliDictionaryOffsetsByLength[len_code]; 233 int t = word_id / num_words; 234 int word_idx = word_id % num_words; 235 offset += len_code * word_idx; 236 const uint8_t* word = &kBrotliDictionary[offset]; 237 return ApplyTransform(kTransforms[t], word, len_code); 238 } 239 240 } // namespace brotli 241 242 #endif // BROTLI_ENC_TRANSFORM_H_ 243