1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "compile/Pseudolocalizer.h" 18 #include "util/Util.h" 19 20 namespace aapt { 21 22 // String basis to generate expansion 23 static const std::u16string k_expansion_string = u"one two three " 24 "four five six seven eight nine ten eleven twelve thirteen " 25 "fourteen fiveteen sixteen seventeen nineteen twenty"; 26 27 // Special unicode characters to override directionality of the words 28 static const std::u16string k_rlm = u"\u200f"; 29 static const std::u16string k_rlo = u"\u202e"; 30 static const std::u16string k_pdf = u"\u202c"; 31 32 // Placeholder marks 33 static const std::u16string k_placeholder_open = u"\u00bb"; 34 static const std::u16string k_placeholder_close = u"\u00ab"; 35 36 static const char16_t k_arg_start = u'{'; 37 static const char16_t k_arg_end = u'}'; 38 39 class PseudoMethodNone : public PseudoMethodImpl { 40 public: 41 std::u16string text(const StringPiece16& text) override { return text.toString(); } 42 std::u16string placeholder(const StringPiece16& text) override { return text.toString(); } 43 }; 44 45 class PseudoMethodBidi : public PseudoMethodImpl { 46 public: 47 std::u16string text(const StringPiece16& text) override; 48 std::u16string placeholder(const StringPiece16& text) override; 49 }; 50 51 class PseudoMethodAccent : public PseudoMethodImpl { 52 public: 53 PseudoMethodAccent() : mDepth(0), mWordCount(0), mLength(0) {} 54 std::u16string start() override; 55 std::u16string end() override; 56 std::u16string text(const StringPiece16& text) override; 57 std::u16string placeholder(const StringPiece16& text) override; 58 private: 59 size_t mDepth; 60 size_t mWordCount; 61 size_t mLength; 62 }; 63 64 Pseudolocalizer::Pseudolocalizer(Method method) : mLastDepth(0) { 65 setMethod(method); 66 } 67 68 void Pseudolocalizer::setMethod(Method method) { 69 switch (method) { 70 case Method::kNone: 71 mImpl = util::make_unique<PseudoMethodNone>(); 72 break; 73 case Method::kAccent: 74 mImpl = util::make_unique<PseudoMethodAccent>(); 75 break; 76 case Method::kBidi: 77 mImpl = util::make_unique<PseudoMethodBidi>(); 78 break; 79 } 80 } 81 82 std::u16string Pseudolocalizer::text(const StringPiece16& text) { 83 std::u16string out; 84 size_t depth = mLastDepth; 85 size_t lastpos, pos; 86 const size_t length = text.size(); 87 const char16_t* str = text.data(); 88 bool escaped = false; 89 for (lastpos = pos = 0; pos < length; pos++) { 90 char16_t c = str[pos]; 91 if (escaped) { 92 escaped = false; 93 continue; 94 } 95 if (c == '\'') { 96 escaped = true; 97 continue; 98 } 99 100 if (c == k_arg_start) { 101 depth++; 102 } else if (c == k_arg_end && depth) { 103 depth--; 104 } 105 106 if (mLastDepth != depth || pos == length - 1) { 107 bool pseudo = ((mLastDepth % 2) == 0); 108 size_t nextpos = pos; 109 if (!pseudo || depth == mLastDepth) { 110 nextpos++; 111 } 112 size_t size = nextpos - lastpos; 113 if (size) { 114 std::u16string chunk = text.substr(lastpos, size).toString(); 115 if (pseudo) { 116 chunk = mImpl->text(chunk); 117 } else if (str[lastpos] == k_arg_start && str[nextpos - 1] == k_arg_end) { 118 chunk = mImpl->placeholder(chunk); 119 } 120 out.append(chunk); 121 } 122 if (pseudo && depth < mLastDepth) { // End of message 123 out.append(mImpl->end()); 124 } else if (!pseudo && depth > mLastDepth) { // Start of message 125 out.append(mImpl->start()); 126 } 127 lastpos = nextpos; 128 mLastDepth = depth; 129 } 130 } 131 return out; 132 } 133 134 static const char16_t* pseudolocalizeChar(const char16_t c) { 135 switch (c) { 136 case 'a': return u"\u00e5"; 137 case 'b': return u"\u0253"; 138 case 'c': return u"\u00e7"; 139 case 'd': return u"\u00f0"; 140 case 'e': return u"\u00e9"; 141 case 'f': return u"\u0192"; 142 case 'g': return u"\u011d"; 143 case 'h': return u"\u0125"; 144 case 'i': return u"\u00ee"; 145 case 'j': return u"\u0135"; 146 case 'k': return u"\u0137"; 147 case 'l': return u"\u013c"; 148 case 'm': return u"\u1e3f"; 149 case 'n': return u"\u00f1"; 150 case 'o': return u"\u00f6"; 151 case 'p': return u"\u00fe"; 152 case 'q': return u"\u0051"; 153 case 'r': return u"\u0155"; 154 case 's': return u"\u0161"; 155 case 't': return u"\u0163"; 156 case 'u': return u"\u00fb"; 157 case 'v': return u"\u0056"; 158 case 'w': return u"\u0175"; 159 case 'x': return u"\u0445"; 160 case 'y': return u"\u00fd"; 161 case 'z': return u"\u017e"; 162 case 'A': return u"\u00c5"; 163 case 'B': return u"\u03b2"; 164 case 'C': return u"\u00c7"; 165 case 'D': return u"\u00d0"; 166 case 'E': return u"\u00c9"; 167 case 'G': return u"\u011c"; 168 case 'H': return u"\u0124"; 169 case 'I': return u"\u00ce"; 170 case 'J': return u"\u0134"; 171 case 'K': return u"\u0136"; 172 case 'L': return u"\u013b"; 173 case 'M': return u"\u1e3e"; 174 case 'N': return u"\u00d1"; 175 case 'O': return u"\u00d6"; 176 case 'P': return u"\u00de"; 177 case 'Q': return u"\u0071"; 178 case 'R': return u"\u0154"; 179 case 'S': return u"\u0160"; 180 case 'T': return u"\u0162"; 181 case 'U': return u"\u00db"; 182 case 'V': return u"\u03bd"; 183 case 'W': return u"\u0174"; 184 case 'X': return u"\u00d7"; 185 case 'Y': return u"\u00dd"; 186 case 'Z': return u"\u017d"; 187 case '!': return u"\u00a1"; 188 case '?': return u"\u00bf"; 189 case '$': return u"\u20ac"; 190 default: return NULL; 191 } 192 } 193 194 static bool isPossibleNormalPlaceholderEnd(const char16_t c) { 195 switch (c) { 196 case 's': return true; 197 case 'S': return true; 198 case 'c': return true; 199 case 'C': return true; 200 case 'd': return true; 201 case 'o': return true; 202 case 'x': return true; 203 case 'X': return true; 204 case 'f': return true; 205 case 'e': return true; 206 case 'E': return true; 207 case 'g': return true; 208 case 'G': return true; 209 case 'a': return true; 210 case 'A': return true; 211 case 'b': return true; 212 case 'B': return true; 213 case 'h': return true; 214 case 'H': return true; 215 case '%': return true; 216 case 'n': return true; 217 default: return false; 218 } 219 } 220 221 static std::u16string pseudoGenerateExpansion(const unsigned int length) { 222 std::u16string result = k_expansion_string; 223 const char16_t* s = result.data(); 224 if (result.size() < length) { 225 result += u" "; 226 result += pseudoGenerateExpansion(length - result.size()); 227 } else { 228 int ext = 0; 229 // Should contain only whole words, so looking for a space 230 for (unsigned int i = length + 1; i < result.size(); ++i) { 231 ++ext; 232 if (s[i] == ' ') { 233 break; 234 } 235 } 236 result = result.substr(0, length + ext); 237 } 238 return result; 239 } 240 241 std::u16string PseudoMethodAccent::start() { 242 std::u16string result; 243 if (mDepth == 0) { 244 result = u"["; 245 } 246 mWordCount = mLength = 0; 247 mDepth++; 248 return result; 249 } 250 251 std::u16string PseudoMethodAccent::end() { 252 std::u16string result; 253 if (mLength) { 254 result += u" "; 255 result += pseudoGenerateExpansion(mWordCount > 3 ? mLength : mLength / 2); 256 } 257 mWordCount = mLength = 0; 258 mDepth--; 259 if (mDepth == 0) { 260 result += u"]"; 261 } 262 return result; 263 } 264 265 /** 266 * Converts characters so they look like they've been localized. 267 * 268 * Note: This leaves placeholder syntax untouched. 269 */ 270 std::u16string PseudoMethodAccent::text(const StringPiece16& source) 271 { 272 const char16_t* s = source.data(); 273 std::u16string result; 274 const size_t I = source.size(); 275 bool lastspace = true; 276 for (size_t i = 0; i < I; i++) { 277 char16_t c = s[i]; 278 if (c == '%') { 279 // Placeholder syntax, no need to pseudolocalize 280 std::u16string chunk; 281 bool end = false; 282 chunk.append(&c, 1); 283 while (!end && i < I) { 284 ++i; 285 c = s[i]; 286 chunk.append(&c, 1); 287 if (isPossibleNormalPlaceholderEnd(c)) { 288 end = true; 289 } else if (c == 't') { 290 ++i; 291 c = s[i]; 292 chunk.append(&c, 1); 293 end = true; 294 } 295 } 296 // Treat chunk as a placeholder unless it ends with %. 297 result += ((c == '%') ? chunk : placeholder(chunk)); 298 } else if (c == '<' || c == '&') { 299 // html syntax, no need to pseudolocalize 300 bool tag_closed = false; 301 while (!tag_closed && i < I) { 302 if (c == '&') { 303 std::u16string escapeText; 304 escapeText.append(&c, 1); 305 bool end = false; 306 size_t htmlCodePos = i; 307 while (!end && htmlCodePos < I) { 308 ++htmlCodePos; 309 c = s[htmlCodePos]; 310 escapeText.append(&c, 1); 311 // Valid html code 312 if (c == ';') { 313 end = true; 314 i = htmlCodePos; 315 } 316 // Wrong html code 317 else if (!((c == '#' || 318 (c >= 'a' && c <= 'z') || 319 (c >= 'A' && c <= 'Z') || 320 (c >= '0' && c <= '9')))) { 321 end = true; 322 } 323 } 324 result += escapeText; 325 if (escapeText != u"<") { 326 tag_closed = true; 327 } 328 continue; 329 } 330 if (c == '>') { 331 tag_closed = true; 332 result.append(&c, 1); 333 continue; 334 } 335 result.append(&c, 1); 336 i++; 337 c = s[i]; 338 } 339 } else { 340 // This is a pure text that should be pseudolocalized 341 const char16_t* p = pseudolocalizeChar(c); 342 if (p != nullptr) { 343 result += p; 344 } else { 345 bool space = util::isspace16(c); 346 if (lastspace && !space) { 347 mWordCount++; 348 } 349 lastspace = space; 350 result.append(&c, 1); 351 } 352 // Count only pseudolocalizable chars and delimiters 353 mLength++; 354 } 355 } 356 return result; 357 } 358 359 std::u16string PseudoMethodAccent::placeholder(const StringPiece16& source) { 360 // Surround a placeholder with brackets 361 return k_placeholder_open + source.toString() + k_placeholder_close; 362 } 363 364 std::u16string PseudoMethodBidi::text(const StringPiece16& source) { 365 const char16_t* s = source.data(); 366 std::u16string result; 367 bool lastspace = true; 368 bool space = true; 369 for (size_t i = 0; i < source.size(); i++) { 370 char16_t c = s[i]; 371 space = util::isspace16(c); 372 if (lastspace && !space) { 373 // Word start 374 result += k_rlm + k_rlo; 375 } else if (!lastspace && space) { 376 // Word end 377 result += k_pdf + k_rlm; 378 } 379 lastspace = space; 380 result.append(&c, 1); 381 } 382 if (!lastspace) { 383 // End of last word 384 result += k_pdf + k_rlm; 385 } 386 return result; 387 } 388 389 std::u16string PseudoMethodBidi::placeholder(const StringPiece16& source) { 390 // Surround a placeholder with directionality change sequence 391 return k_rlm + k_rlo + source.toString() + k_pdf + k_rlm; 392 } 393 394 } // namespace aapt 395