1 /* Copyright 2013 Google Inc. All Rights Reserved. 2 3 Distributed under MIT license. 4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT 5 */ 6 7 /* Transformations on dictionary words. */ 8 9 #ifndef BROTLI_DEC_TRANSFORM_H_ 10 #define BROTLI_DEC_TRANSFORM_H_ 11 12 #include <brotli/types.h> 13 #include "./port.h" 14 15 #if defined(__cplusplus) || defined(c_plusplus) 16 extern "C" { 17 #endif 18 19 enum WordTransformType { 20 kIdentity = 0, 21 kOmitLast1 = 1, 22 kOmitLast2 = 2, 23 kOmitLast3 = 3, 24 kOmitLast4 = 4, 25 kOmitLast5 = 5, 26 kOmitLast6 = 6, 27 kOmitLast7 = 7, 28 kOmitLast8 = 8, 29 kOmitLast9 = 9, 30 kUppercaseFirst = 10, 31 kUppercaseAll = 11, 32 kOmitFirst1 = 12, 33 kOmitFirst2 = 13, 34 kOmitFirst3 = 14, 35 kOmitFirst4 = 15, 36 kOmitFirst5 = 16, 37 kOmitFirst6 = 17, 38 kOmitFirst7 = 18, 39 kOmitFirst8 = 19, 40 kOmitFirst9 = 20 41 }; 42 43 typedef struct { 44 const uint8_t prefix_id; 45 const uint8_t transform; 46 const uint8_t suffix_id; 47 } Transform; 48 49 static const char kPrefixSuffix[208] = 50 "\0 \0, \0 of the \0 of \0s \0.\0 and \0 in \0\"\0 to \0\">\0\n\0. \0]\0" 51 " for \0 a \0 that \0\'\0 with \0 from \0 by \0(\0. The \0 on \0 as \0" 52 " is \0ing \0\n\t\0:\0ed \0=\"\0 at \0ly \0,\0=\'\0.com/\0. This \0" 53 " not \0er \0al \0ful \0ive \0less \0est \0ize \0\xc2\xa0\0ous "; 54 55 enum { 56 /* EMPTY = "" 57 SP = " " 58 DQUOT = "\"" 59 SQUOT = "'" 60 CLOSEBR = "]" 61 OPEN = "(" 62 SLASH = "/" 63 NBSP = non-breaking space "\0xc2\xa0" 64 */ 65 kPFix_EMPTY = 0, 66 kPFix_SP = 1, 67 kPFix_COMMASP = 3, 68 kPFix_SPofSPtheSP = 6, 69 kPFix_SPtheSP = 9, 70 kPFix_eSP = 12, 71 kPFix_SPofSP = 15, 72 kPFix_sSP = 20, 73 kPFix_DOT = 23, 74 kPFix_SPandSP = 25, 75 kPFix_SPinSP = 31, 76 kPFix_DQUOT = 36, 77 kPFix_SPtoSP = 38, 78 kPFix_DQUOTGT = 43, 79 kPFix_NEWLINE = 46, 80 kPFix_DOTSP = 48, 81 kPFix_CLOSEBR = 51, 82 kPFix_SPforSP = 53, 83 kPFix_SPaSP = 59, 84 kPFix_SPthatSP = 63, 85 kPFix_SQUOT = 70, 86 kPFix_SPwithSP = 72, 87 kPFix_SPfromSP = 79, 88 kPFix_SPbySP = 86, 89 kPFix_OPEN = 91, 90 kPFix_DOTSPTheSP = 93, 91 kPFix_SPonSP = 100, 92 kPFix_SPasSP = 105, 93 kPFix_SPisSP = 110, 94 kPFix_ingSP = 115, 95 kPFix_NEWLINETAB = 120, 96 kPFix_COLON = 123, 97 kPFix_edSP = 125, 98 kPFix_EQDQUOT = 129, 99 kPFix_SPatSP = 132, 100 kPFix_lySP = 137, 101 kPFix_COMMA = 141, 102 kPFix_EQSQUOT = 143, 103 kPFix_DOTcomSLASH = 146, 104 kPFix_DOTSPThisSP = 152, 105 kPFix_SPnotSP = 160, 106 kPFix_erSP = 166, 107 kPFix_alSP = 170, 108 kPFix_fulSP = 174, 109 kPFix_iveSP = 179, 110 kPFix_lessSP = 184, 111 kPFix_estSP = 190, 112 kPFix_izeSP = 195, 113 kPFix_NBSP = 200, 114 kPFix_ousSP = 203 115 }; 116 117 static const Transform kTransforms[] = { 118 { kPFix_EMPTY, kIdentity, kPFix_EMPTY }, 119 { kPFix_EMPTY, kIdentity, kPFix_SP }, 120 { kPFix_SP, kIdentity, kPFix_SP }, 121 { kPFix_EMPTY, kOmitFirst1, kPFix_EMPTY }, 122 { kPFix_EMPTY, kUppercaseFirst, kPFix_SP }, 123 { kPFix_EMPTY, kIdentity, kPFix_SPtheSP }, 124 { kPFix_SP, kIdentity, kPFix_EMPTY }, 125 { kPFix_sSP, kIdentity, kPFix_SP }, 126 { kPFix_EMPTY, kIdentity, kPFix_SPofSP }, 127 { kPFix_EMPTY, kUppercaseFirst, kPFix_EMPTY }, 128 { kPFix_EMPTY, kIdentity, kPFix_SPandSP }, 129 { kPFix_EMPTY, kOmitFirst2, kPFix_EMPTY }, 130 { kPFix_EMPTY, kOmitLast1, kPFix_EMPTY }, 131 { kPFix_COMMASP, kIdentity, kPFix_SP }, 132 { kPFix_EMPTY, kIdentity, kPFix_COMMASP }, 133 { kPFix_SP, kUppercaseFirst, kPFix_SP }, 134 { kPFix_EMPTY, kIdentity, kPFix_SPinSP }, 135 { kPFix_EMPTY, kIdentity, kPFix_SPtoSP }, 136 { kPFix_eSP, kIdentity, kPFix_SP }, 137 { kPFix_EMPTY, kIdentity, kPFix_DQUOT }, 138 { kPFix_EMPTY, kIdentity, kPFix_DOT }, 139 { kPFix_EMPTY, kIdentity, kPFix_DQUOTGT }, 140 { kPFix_EMPTY, kIdentity, kPFix_NEWLINE }, 141 { kPFix_EMPTY, kOmitLast3, kPFix_EMPTY }, 142 { kPFix_EMPTY, kIdentity, kPFix_CLOSEBR }, 143 { kPFix_EMPTY, kIdentity, kPFix_SPforSP }, 144 { kPFix_EMPTY, kOmitFirst3, kPFix_EMPTY }, 145 { kPFix_EMPTY, kOmitLast2, kPFix_EMPTY }, 146 { kPFix_EMPTY, kIdentity, kPFix_SPaSP }, 147 { kPFix_EMPTY, kIdentity, kPFix_SPthatSP }, 148 { kPFix_SP, kUppercaseFirst, kPFix_EMPTY }, 149 { kPFix_EMPTY, kIdentity, kPFix_DOTSP }, 150 { kPFix_DOT, kIdentity, kPFix_EMPTY }, 151 { kPFix_SP, kIdentity, kPFix_COMMASP }, 152 { kPFix_EMPTY, kOmitFirst4, kPFix_EMPTY }, 153 { kPFix_EMPTY, kIdentity, kPFix_SPwithSP }, 154 { kPFix_EMPTY, kIdentity, kPFix_SQUOT }, 155 { kPFix_EMPTY, kIdentity, kPFix_SPfromSP }, 156 { kPFix_EMPTY, kIdentity, kPFix_SPbySP }, 157 { kPFix_EMPTY, kOmitFirst5, kPFix_EMPTY }, 158 { kPFix_EMPTY, kOmitFirst6, kPFix_EMPTY }, 159 { kPFix_SPtheSP, kIdentity, kPFix_EMPTY }, 160 { kPFix_EMPTY, kOmitLast4, kPFix_EMPTY }, 161 { kPFix_EMPTY, kIdentity, kPFix_DOTSPTheSP }, 162 { kPFix_EMPTY, kUppercaseAll, kPFix_EMPTY }, 163 { kPFix_EMPTY, kIdentity, kPFix_SPonSP }, 164 { kPFix_EMPTY, kIdentity, kPFix_SPasSP }, 165 { kPFix_EMPTY, kIdentity, kPFix_SPisSP }, 166 { kPFix_EMPTY, kOmitLast7, kPFix_EMPTY }, 167 { kPFix_EMPTY, kOmitLast1, kPFix_ingSP }, 168 { kPFix_EMPTY, kIdentity, kPFix_NEWLINETAB }, 169 { kPFix_EMPTY, kIdentity, kPFix_COLON }, 170 { kPFix_SP, kIdentity, kPFix_DOTSP }, 171 { kPFix_EMPTY, kIdentity, kPFix_edSP }, 172 { kPFix_EMPTY, kOmitFirst9, kPFix_EMPTY }, 173 { kPFix_EMPTY, kOmitFirst7, kPFix_EMPTY }, 174 { kPFix_EMPTY, kOmitLast6, kPFix_EMPTY }, 175 { kPFix_EMPTY, kIdentity, kPFix_OPEN }, 176 { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMASP }, 177 { kPFix_EMPTY, kOmitLast8, kPFix_EMPTY }, 178 { kPFix_EMPTY, kIdentity, kPFix_SPatSP }, 179 { kPFix_EMPTY, kIdentity, kPFix_lySP }, 180 { kPFix_SPtheSP, kIdentity, kPFix_SPofSP }, 181 { kPFix_EMPTY, kOmitLast5, kPFix_EMPTY }, 182 { kPFix_EMPTY, kOmitLast9, kPFix_EMPTY }, 183 { kPFix_SP, kUppercaseFirst, kPFix_COMMASP }, 184 { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOT }, 185 { kPFix_DOT, kIdentity, kPFix_OPEN }, 186 { kPFix_EMPTY, kUppercaseAll, kPFix_SP }, 187 { kPFix_EMPTY, kUppercaseFirst, kPFix_DQUOTGT }, 188 { kPFix_EMPTY, kIdentity, kPFix_EQDQUOT }, 189 { kPFix_SP, kIdentity, kPFix_DOT }, 190 { kPFix_DOTcomSLASH, kIdentity, kPFix_EMPTY }, 191 { kPFix_SPtheSP, kIdentity, kPFix_SPofSPtheSP }, 192 { kPFix_EMPTY, kUppercaseFirst, kPFix_SQUOT }, 193 { kPFix_EMPTY, kIdentity, kPFix_DOTSPThisSP }, 194 { kPFix_EMPTY, kIdentity, kPFix_COMMA }, 195 { kPFix_DOT, kIdentity, kPFix_SP }, 196 { kPFix_EMPTY, kUppercaseFirst, kPFix_OPEN }, 197 { kPFix_EMPTY, kUppercaseFirst, kPFix_DOT }, 198 { kPFix_EMPTY, kIdentity, kPFix_SPnotSP }, 199 { kPFix_SP, kIdentity, kPFix_EQDQUOT }, 200 { kPFix_EMPTY, kIdentity, kPFix_erSP }, 201 { kPFix_SP, kUppercaseAll, kPFix_SP }, 202 { kPFix_EMPTY, kIdentity, kPFix_alSP }, 203 { kPFix_SP, kUppercaseAll, kPFix_EMPTY }, 204 { kPFix_EMPTY, kIdentity, kPFix_EQSQUOT }, 205 { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOT }, 206 { kPFix_EMPTY, kUppercaseFirst, kPFix_DOTSP }, 207 { kPFix_SP, kIdentity, kPFix_OPEN }, 208 { kPFix_EMPTY, kIdentity, kPFix_fulSP }, 209 { kPFix_SP, kUppercaseFirst, kPFix_DOTSP }, 210 { kPFix_EMPTY, kIdentity, kPFix_iveSP }, 211 { kPFix_EMPTY, kIdentity, kPFix_lessSP }, 212 { kPFix_EMPTY, kUppercaseAll, kPFix_SQUOT }, 213 { kPFix_EMPTY, kIdentity, kPFix_estSP }, 214 { kPFix_SP, kUppercaseFirst, kPFix_DOT }, 215 { kPFix_EMPTY, kUppercaseAll, kPFix_DQUOTGT }, 216 { kPFix_SP, kIdentity, kPFix_EQSQUOT }, 217 { kPFix_EMPTY, kUppercaseFirst, kPFix_COMMA }, 218 { kPFix_EMPTY, kIdentity, kPFix_izeSP }, 219 { kPFix_EMPTY, kUppercaseAll, kPFix_DOT }, 220 { kPFix_NBSP, kIdentity, kPFix_EMPTY }, 221 { kPFix_SP, kIdentity, kPFix_COMMA }, 222 { kPFix_EMPTY, kUppercaseFirst, kPFix_EQDQUOT }, 223 { kPFix_EMPTY, kUppercaseAll, kPFix_EQDQUOT }, 224 { kPFix_EMPTY, kIdentity, kPFix_ousSP }, 225 { kPFix_EMPTY, kUppercaseAll, kPFix_COMMASP }, 226 { kPFix_EMPTY, kUppercaseFirst, kPFix_EQSQUOT }, 227 { kPFix_SP, kUppercaseFirst, kPFix_COMMA }, 228 { kPFix_SP, kUppercaseAll, kPFix_EQDQUOT }, 229 { kPFix_SP, kUppercaseAll, kPFix_COMMASP }, 230 { kPFix_EMPTY, kUppercaseAll, kPFix_COMMA }, 231 { kPFix_EMPTY, kUppercaseAll, kPFix_OPEN }, 232 { kPFix_EMPTY, kUppercaseAll, kPFix_DOTSP }, 233 { kPFix_SP, kUppercaseAll, kPFix_DOT }, 234 { kPFix_EMPTY, kUppercaseAll, kPFix_EQSQUOT }, 235 { kPFix_SP, kUppercaseAll, kPFix_DOTSP }, 236 { kPFix_SP, kUppercaseFirst, kPFix_EQDQUOT }, 237 { kPFix_SP, kUppercaseAll, kPFix_EQSQUOT }, 238 { kPFix_SP, kUppercaseFirst, kPFix_EQSQUOT }, 239 }; 240 241 static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]); 242 243 static int ToUpperCase(uint8_t* p) { 244 if (p[0] < 0xc0) { 245 if (p[0] >= 'a' && p[0] <= 'z') { 246 p[0] ^= 32; 247 } 248 return 1; 249 } 250 /* An overly simplified uppercasing model for UTF-8. */ 251 if (p[0] < 0xe0) { 252 p[1] ^= 32; 253 return 2; 254 } 255 /* An arbitrary transform for three byte characters. */ 256 p[2] ^= 5; 257 return 3; 258 } 259 260 static BROTLI_NOINLINE int TransformDictionaryWord( 261 uint8_t* dst, const uint8_t* word, int len, int transform) { 262 int idx = 0; 263 { 264 const char* prefix = &kPrefixSuffix[kTransforms[transform].prefix_id]; 265 while (*prefix) { dst[idx++] = (uint8_t)*prefix++; } 266 } 267 { 268 const int t = kTransforms[transform].transform; 269 int i = 0; 270 int skip = t - (kOmitFirst1 - 1); 271 if (skip > 0) { 272 word += skip; 273 len -= skip; 274 } else if (t <= kOmitLast9) { 275 len -= t; 276 } 277 while (i < len) { dst[idx++] = word[i++]; } 278 if (t == kUppercaseFirst) { 279 ToUpperCase(&dst[idx - len]); 280 } else if (t == kUppercaseAll) { 281 uint8_t* uppercase = &dst[idx - len]; 282 while (len > 0) { 283 int step = ToUpperCase(uppercase); 284 uppercase += step; 285 len -= step; 286 } 287 } 288 } 289 { 290 const char* suffix = &kPrefixSuffix[kTransforms[transform].suffix_id]; 291 while (*suffix) { dst[idx++] = (uint8_t)*suffix++; } 292 return idx; 293 } 294 } 295 296 #if defined(__cplusplus) || defined(c_plusplus) 297 } /* extern "C" */ 298 #endif 299 300 #endif /* BROTLI_DEC_TRANSFORM_H_ */ 301