1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "compile/Pseudolocalizer.h" 18 19 #include "util/Util.h" 20 21 using android::StringPiece; 22 23 namespace aapt { 24 25 // String basis to generate expansion 26 static const std::string kExpansionString = 27 "one two three " 28 "four five six seven eight nine ten eleven twelve thirteen " 29 "fourteen fiveteen sixteen seventeen nineteen twenty"; 30 31 // Special unicode characters to override directionality of the words 32 static const std::string kRlm = "\u200f"; 33 static const std::string kRlo = "\u202e"; 34 static const std::string kPdf = "\u202c"; 35 36 // Placeholder marks 37 static const std::string kPlaceholderOpen = "\u00bb"; 38 static const std::string kPlaceholderClose = "\u00ab"; 39 40 static const char kArgStart = '{'; 41 static const char kArgEnd = '}'; 42 43 class PseudoMethodNone : public PseudoMethodImpl { 44 public: 45 std::string Text(const StringPiece& text) override { return text.to_string(); } 46 std::string Placeholder(const StringPiece& text) override { return text.to_string(); } 47 }; 48 49 class PseudoMethodBidi : public PseudoMethodImpl { 50 public: 51 std::string Text(const StringPiece& text) override; 52 std::string Placeholder(const StringPiece& text) override; 53 }; 54 55 class PseudoMethodAccent : public PseudoMethodImpl { 56 public: 57 PseudoMethodAccent() : depth_(0), word_count_(0), length_(0) {} 58 std::string Start() override; 59 std::string End() override; 60 std::string Text(const StringPiece& text) override; 61 std::string Placeholder(const StringPiece& text) override; 62 63 private: 64 size_t depth_; 65 size_t word_count_; 66 size_t length_; 67 }; 68 69 Pseudolocalizer::Pseudolocalizer(Method method) : last_depth_(0) { 70 SetMethod(method); 71 } 72 73 void Pseudolocalizer::SetMethod(Method method) { 74 switch (method) { 75 case Method::kNone: 76 impl_ = util::make_unique<PseudoMethodNone>(); 77 break; 78 case Method::kAccent: 79 impl_ = util::make_unique<PseudoMethodAccent>(); 80 break; 81 case Method::kBidi: 82 impl_ = util::make_unique<PseudoMethodBidi>(); 83 break; 84 } 85 } 86 87 std::string Pseudolocalizer::Text(const StringPiece& text) { 88 std::string out; 89 size_t depth = last_depth_; 90 size_t lastpos, pos; 91 const size_t length = text.size(); 92 const char* str = text.data(); 93 bool escaped = false; 94 for (lastpos = pos = 0; pos < length; pos++) { 95 char16_t c = str[pos]; 96 if (escaped) { 97 escaped = false; 98 continue; 99 } 100 if (c == '\'') { 101 escaped = true; 102 continue; 103 } 104 105 if (c == kArgStart) { 106 depth++; 107 } else if (c == kArgEnd && depth) { 108 depth--; 109 } 110 111 if (last_depth_ != depth || pos == length - 1) { 112 bool pseudo = ((last_depth_ % 2) == 0); 113 size_t nextpos = pos; 114 if (!pseudo || depth == last_depth_) { 115 nextpos++; 116 } 117 size_t size = nextpos - lastpos; 118 if (size) { 119 std::string chunk = text.substr(lastpos, size).to_string(); 120 if (pseudo) { 121 chunk = impl_->Text(chunk); 122 } else if (str[lastpos] == kArgStart && str[nextpos - 1] == kArgEnd) { 123 chunk = impl_->Placeholder(chunk); 124 } 125 out.append(chunk); 126 } 127 if (pseudo && depth < last_depth_) { // End of message 128 out.append(impl_->End()); 129 } else if (!pseudo && depth > last_depth_) { // Start of message 130 out.append(impl_->Start()); 131 } 132 lastpos = nextpos; 133 last_depth_ = depth; 134 } 135 } 136 return out; 137 } 138 139 static const char* PseudolocalizeChar(const char c) { 140 switch (c) { 141 case 'a': 142 return "\u00e5"; 143 case 'b': 144 return "\u0253"; 145 case 'c': 146 return "\u00e7"; 147 case 'd': 148 return "\u00f0"; 149 case 'e': 150 return "\u00e9"; 151 case 'f': 152 return "\u0192"; 153 case 'g': 154 return "\u011d"; 155 case 'h': 156 return "\u0125"; 157 case 'i': 158 return "\u00ee"; 159 case 'j': 160 return "\u0135"; 161 case 'k': 162 return "\u0137"; 163 case 'l': 164 return "\u013c"; 165 case 'm': 166 return "\u1e3f"; 167 case 'n': 168 return "\u00f1"; 169 case 'o': 170 return "\u00f6"; 171 case 'p': 172 return "\u00fe"; 173 case 'q': 174 return "\u0051"; 175 case 'r': 176 return "\u0155"; 177 case 's': 178 return "\u0161"; 179 case 't': 180 return "\u0163"; 181 case 'u': 182 return "\u00fb"; 183 case 'v': 184 return "\u0056"; 185 case 'w': 186 return "\u0175"; 187 case 'x': 188 return "\u0445"; 189 case 'y': 190 return "\u00fd"; 191 case 'z': 192 return "\u017e"; 193 case 'A': 194 return "\u00c5"; 195 case 'B': 196 return "\u03b2"; 197 case 'C': 198 return "\u00c7"; 199 case 'D': 200 return "\u00d0"; 201 case 'E': 202 return "\u00c9"; 203 case 'G': 204 return "\u011c"; 205 case 'H': 206 return "\u0124"; 207 case 'I': 208 return "\u00ce"; 209 case 'J': 210 return "\u0134"; 211 case 'K': 212 return "\u0136"; 213 case 'L': 214 return "\u013b"; 215 case 'M': 216 return "\u1e3e"; 217 case 'N': 218 return "\u00d1"; 219 case 'O': 220 return "\u00d6"; 221 case 'P': 222 return "\u00de"; 223 case 'Q': 224 return "\u0071"; 225 case 'R': 226 return "\u0154"; 227 case 'S': 228 return "\u0160"; 229 case 'T': 230 return "\u0162"; 231 case 'U': 232 return "\u00db"; 233 case 'V': 234 return "\u03bd"; 235 case 'W': 236 return "\u0174"; 237 case 'X': 238 return "\u00d7"; 239 case 'Y': 240 return "\u00dd"; 241 case 'Z': 242 return "\u017d"; 243 case '!': 244 return "\u00a1"; 245 case '?': 246 return "\u00bf"; 247 case '$': 248 return "\u20ac"; 249 default: 250 return nullptr; 251 } 252 } 253 254 static bool IsPossibleNormalPlaceholderEnd(const char c) { 255 switch (c) { 256 case 's': 257 return true; 258 case 'S': 259 return true; 260 case 'c': 261 return true; 262 case 'C': 263 return true; 264 case 'd': 265 return true; 266 case 'o': 267 return true; 268 case 'x': 269 return true; 270 case 'X': 271 return true; 272 case 'f': 273 return true; 274 case 'e': 275 return true; 276 case 'E': 277 return true; 278 case 'g': 279 return true; 280 case 'G': 281 return true; 282 case 'a': 283 return true; 284 case 'A': 285 return true; 286 case 'b': 287 return true; 288 case 'B': 289 return true; 290 case 'h': 291 return true; 292 case 'H': 293 return true; 294 case '%': 295 return true; 296 case 'n': 297 return true; 298 default: 299 return false; 300 } 301 } 302 303 static std::string PseudoGenerateExpansion(const unsigned int length) { 304 std::string result = kExpansionString; 305 const char* s = result.data(); 306 if (result.size() < length) { 307 result += " "; 308 result += PseudoGenerateExpansion(length - result.size()); 309 } else { 310 int ext = 0; 311 // Should contain only whole words, so looking for a space 312 for (unsigned int i = length + 1; i < result.size(); ++i) { 313 ++ext; 314 if (s[i] == ' ') { 315 break; 316 } 317 } 318 result = result.substr(0, length + ext); 319 } 320 return result; 321 } 322 323 std::string PseudoMethodAccent::Start() { 324 std::string result; 325 if (depth_ == 0) { 326 result = "["; 327 } 328 word_count_ = length_ = 0; 329 depth_++; 330 return result; 331 } 332 333 std::string PseudoMethodAccent::End() { 334 std::string result; 335 if (length_) { 336 result += " "; 337 result += PseudoGenerateExpansion(word_count_ > 3 ? length_ : length_ / 2); 338 } 339 word_count_ = length_ = 0; 340 depth_--; 341 if (depth_ == 0) { 342 result += "]"; 343 } 344 return result; 345 } 346 347 /** 348 * Converts characters so they look like they've been localized. 349 * 350 * Note: This leaves placeholder syntax untouched. 351 */ 352 std::string PseudoMethodAccent::Text(const StringPiece& source) { 353 const char* s = source.data(); 354 std::string result; 355 const size_t I = source.size(); 356 bool lastspace = true; 357 for (size_t i = 0; i < I; i++) { 358 char c = s[i]; 359 if (c == '%') { 360 // Placeholder syntax, no need to pseudolocalize 361 std::string chunk; 362 bool end = false; 363 chunk.append(&c, 1); 364 while (!end && i + 1 < I) { 365 ++i; 366 c = s[i]; 367 chunk.append(&c, 1); 368 if (IsPossibleNormalPlaceholderEnd(c)) { 369 end = true; 370 } else if (i + 1 < I && c == 't') { 371 ++i; 372 c = s[i]; 373 chunk.append(&c, 1); 374 end = true; 375 } 376 } 377 // Treat chunk as a placeholder unless it ends with %. 378 result += ((c == '%') ? chunk : Placeholder(chunk)); 379 } else if (c == '<' || c == '&') { 380 // html syntax, no need to pseudolocalize 381 bool tag_closed = false; 382 while (!tag_closed && i < I) { 383 if (c == '&') { 384 std::string escape_text; 385 escape_text.append(&c, 1); 386 bool end = false; 387 size_t html_code_pos = i; 388 while (!end && html_code_pos < I) { 389 ++html_code_pos; 390 c = s[html_code_pos]; 391 escape_text.append(&c, 1); 392 // Valid html code 393 if (c == ';') { 394 end = true; 395 i = html_code_pos; 396 } 397 // Wrong html code 398 else if (!((c == '#' || (c >= 'a' && c <= 'z') || 399 (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')))) { 400 end = true; 401 } 402 } 403 result += escape_text; 404 if (escape_text != "<") { 405 tag_closed = true; 406 } 407 continue; 408 } 409 if (c == '>') { 410 tag_closed = true; 411 result.append(&c, 1); 412 continue; 413 } 414 result.append(&c, 1); 415 i++; 416 c = s[i]; 417 } 418 } else { 419 // This is a pure text that should be pseudolocalized 420 const char* p = PseudolocalizeChar(c); 421 if (p != nullptr) { 422 result += p; 423 } else { 424 bool space = isspace(c); 425 if (lastspace && !space) { 426 word_count_++; 427 } 428 lastspace = space; 429 result.append(&c, 1); 430 } 431 // Count only pseudolocalizable chars and delimiters 432 length_++; 433 } 434 } 435 return result; 436 } 437 438 std::string PseudoMethodAccent::Placeholder(const StringPiece& source) { 439 // Surround a placeholder with brackets 440 return kPlaceholderOpen + source.to_string() + kPlaceholderClose; 441 } 442 443 std::string PseudoMethodBidi::Text(const StringPiece& source) { 444 const char* s = source.data(); 445 std::string result; 446 bool lastspace = true; 447 bool space = true; 448 bool escape = false; 449 const char ESCAPE_CHAR = '\\'; 450 for (size_t i = 0; i < source.size(); i++) { 451 char c = s[i]; 452 if (!escape && c == ESCAPE_CHAR) { 453 escape = true; 454 continue; 455 } 456 space = (!escape && isspace(c)) || (escape && (c == 'n' || c == 't')); 457 if (lastspace && !space) { 458 // Word start 459 result += kRlm + kRlo; 460 } else if (!lastspace && space) { 461 // Word end 462 result += kPdf + kRlm; 463 } 464 lastspace = space; 465 if (escape) { 466 result.append(&ESCAPE_CHAR, 1); 467 escape=false; 468 } 469 result.append(&c, 1); 470 } 471 if (!lastspace) { 472 // End of last word 473 result += kPdf + kRlm; 474 } 475 return result; 476 } 477 478 std::string PseudoMethodBidi::Placeholder(const StringPiece& source) { 479 // Surround a placeholder with directionality change sequence 480 return kRlm + kRlo + source.to_string() + kPdf + kRlm; 481 } 482 483 } // namespace aapt 484