Home | History | Annotate | Download | only in compile
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "compile/Pseudolocalizer.h"
     18 
     19 #include "util/Util.h"
     20 
     21 using android::StringPiece;
     22 
     23 namespace aapt {
     24 
     25 // String basis to generate expansion
     26 static const std::string kExpansionString =
     27     "one two three "
     28     "four five six seven eight nine ten eleven twelve thirteen "
     29     "fourteen fiveteen sixteen seventeen nineteen twenty";
     30 
     31 // Special unicode characters to override directionality of the words
     32 static const std::string kRlm = "\u200f";
     33 static const std::string kRlo = "\u202e";
     34 static const std::string kPdf = "\u202c";
     35 
     36 // Placeholder marks
     37 static const std::string kPlaceholderOpen = "\u00bb";
     38 static const std::string kPlaceholderClose = "\u00ab";
     39 
     40 static const char kArgStart = '{';
     41 static const char kArgEnd = '}';
     42 
     43 class PseudoMethodNone : public PseudoMethodImpl {
     44  public:
     45   std::string Text(const StringPiece& text) override { return text.to_string(); }
     46   std::string Placeholder(const StringPiece& text) override { return text.to_string(); }
     47 };
     48 
     49 class PseudoMethodBidi : public PseudoMethodImpl {
     50  public:
     51   std::string Text(const StringPiece& text) override;
     52   std::string Placeholder(const StringPiece& text) override;
     53 };
     54 
     55 class PseudoMethodAccent : public PseudoMethodImpl {
     56  public:
     57   PseudoMethodAccent() : depth_(0), word_count_(0), length_(0) {}
     58   std::string Start() override;
     59   std::string End() override;
     60   std::string Text(const StringPiece& text) override;
     61   std::string Placeholder(const StringPiece& text) override;
     62 
     63  private:
     64   size_t depth_;
     65   size_t word_count_;
     66   size_t length_;
     67 };
     68 
     69 Pseudolocalizer::Pseudolocalizer(Method method) : last_depth_(0) {
     70   SetMethod(method);
     71 }
     72 
     73 void Pseudolocalizer::SetMethod(Method method) {
     74   switch (method) {
     75     case Method::kNone:
     76       impl_ = util::make_unique<PseudoMethodNone>();
     77       break;
     78     case Method::kAccent:
     79       impl_ = util::make_unique<PseudoMethodAccent>();
     80       break;
     81     case Method::kBidi:
     82       impl_ = util::make_unique<PseudoMethodBidi>();
     83       break;
     84   }
     85 }
     86 
     87 std::string Pseudolocalizer::Text(const StringPiece& text) {
     88   std::string out;
     89   size_t depth = last_depth_;
     90   size_t lastpos, pos;
     91   const size_t length = text.size();
     92   const char* str = text.data();
     93   bool escaped = false;
     94   for (lastpos = pos = 0; pos < length; pos++) {
     95     char16_t c = str[pos];
     96     if (escaped) {
     97       escaped = false;
     98       continue;
     99     }
    100     if (c == '\'') {
    101       escaped = true;
    102       continue;
    103     }
    104 
    105     if (c == kArgStart) {
    106       depth++;
    107     } else if (c == kArgEnd && depth) {
    108       depth--;
    109     }
    110 
    111     if (last_depth_ != depth || pos == length - 1) {
    112       bool pseudo = ((last_depth_ % 2) == 0);
    113       size_t nextpos = pos;
    114       if (!pseudo || depth == last_depth_) {
    115         nextpos++;
    116       }
    117       size_t size = nextpos - lastpos;
    118       if (size) {
    119         std::string chunk = text.substr(lastpos, size).to_string();
    120         if (pseudo) {
    121           chunk = impl_->Text(chunk);
    122         } else if (str[lastpos] == kArgStart && str[nextpos - 1] == kArgEnd) {
    123           chunk = impl_->Placeholder(chunk);
    124         }
    125         out.append(chunk);
    126       }
    127       if (pseudo && depth < last_depth_) {  // End of message
    128         out.append(impl_->End());
    129       } else if (!pseudo && depth > last_depth_) {  // Start of message
    130         out.append(impl_->Start());
    131       }
    132       lastpos = nextpos;
    133       last_depth_ = depth;
    134     }
    135   }
    136   return out;
    137 }
    138 
    139 static const char* PseudolocalizeChar(const char c) {
    140   switch (c) {
    141     case 'a':
    142       return "\u00e5";
    143     case 'b':
    144       return "\u0253";
    145     case 'c':
    146       return "\u00e7";
    147     case 'd':
    148       return "\u00f0";
    149     case 'e':
    150       return "\u00e9";
    151     case 'f':
    152       return "\u0192";
    153     case 'g':
    154       return "\u011d";
    155     case 'h':
    156       return "\u0125";
    157     case 'i':
    158       return "\u00ee";
    159     case 'j':
    160       return "\u0135";
    161     case 'k':
    162       return "\u0137";
    163     case 'l':
    164       return "\u013c";
    165     case 'm':
    166       return "\u1e3f";
    167     case 'n':
    168       return "\u00f1";
    169     case 'o':
    170       return "\u00f6";
    171     case 'p':
    172       return "\u00fe";
    173     case 'q':
    174       return "\u0051";
    175     case 'r':
    176       return "\u0155";
    177     case 's':
    178       return "\u0161";
    179     case 't':
    180       return "\u0163";
    181     case 'u':
    182       return "\u00fb";
    183     case 'v':
    184       return "\u0056";
    185     case 'w':
    186       return "\u0175";
    187     case 'x':
    188       return "\u0445";
    189     case 'y':
    190       return "\u00fd";
    191     case 'z':
    192       return "\u017e";
    193     case 'A':
    194       return "\u00c5";
    195     case 'B':
    196       return "\u03b2";
    197     case 'C':
    198       return "\u00c7";
    199     case 'D':
    200       return "\u00d0";
    201     case 'E':
    202       return "\u00c9";
    203     case 'G':
    204       return "\u011c";
    205     case 'H':
    206       return "\u0124";
    207     case 'I':
    208       return "\u00ce";
    209     case 'J':
    210       return "\u0134";
    211     case 'K':
    212       return "\u0136";
    213     case 'L':
    214       return "\u013b";
    215     case 'M':
    216       return "\u1e3e";
    217     case 'N':
    218       return "\u00d1";
    219     case 'O':
    220       return "\u00d6";
    221     case 'P':
    222       return "\u00de";
    223     case 'Q':
    224       return "\u0071";
    225     case 'R':
    226       return "\u0154";
    227     case 'S':
    228       return "\u0160";
    229     case 'T':
    230       return "\u0162";
    231     case 'U':
    232       return "\u00db";
    233     case 'V':
    234       return "\u03bd";
    235     case 'W':
    236       return "\u0174";
    237     case 'X':
    238       return "\u00d7";
    239     case 'Y':
    240       return "\u00dd";
    241     case 'Z':
    242       return "\u017d";
    243     case '!':
    244       return "\u00a1";
    245     case '?':
    246       return "\u00bf";
    247     case '$':
    248       return "\u20ac";
    249     default:
    250       return nullptr;
    251   }
    252 }
    253 
    254 static bool IsPossibleNormalPlaceholderEnd(const char c) {
    255   switch (c) {
    256     case 's':
    257       return true;
    258     case 'S':
    259       return true;
    260     case 'c':
    261       return true;
    262     case 'C':
    263       return true;
    264     case 'd':
    265       return true;
    266     case 'o':
    267       return true;
    268     case 'x':
    269       return true;
    270     case 'X':
    271       return true;
    272     case 'f':
    273       return true;
    274     case 'e':
    275       return true;
    276     case 'E':
    277       return true;
    278     case 'g':
    279       return true;
    280     case 'G':
    281       return true;
    282     case 'a':
    283       return true;
    284     case 'A':
    285       return true;
    286     case 'b':
    287       return true;
    288     case 'B':
    289       return true;
    290     case 'h':
    291       return true;
    292     case 'H':
    293       return true;
    294     case '%':
    295       return true;
    296     case 'n':
    297       return true;
    298     default:
    299       return false;
    300   }
    301 }
    302 
    303 static std::string PseudoGenerateExpansion(const unsigned int length) {
    304   std::string result = kExpansionString;
    305   const char* s = result.data();
    306   if (result.size() < length) {
    307     result += " ";
    308     result += PseudoGenerateExpansion(length - result.size());
    309   } else {
    310     int ext = 0;
    311     // Should contain only whole words, so looking for a space
    312     for (unsigned int i = length + 1; i < result.size(); ++i) {
    313       ++ext;
    314       if (s[i] == ' ') {
    315         break;
    316       }
    317     }
    318     result = result.substr(0, length + ext);
    319   }
    320   return result;
    321 }
    322 
    323 std::string PseudoMethodAccent::Start() {
    324   std::string result;
    325   if (depth_ == 0) {
    326     result = "[";
    327   }
    328   word_count_ = length_ = 0;
    329   depth_++;
    330   return result;
    331 }
    332 
    333 std::string PseudoMethodAccent::End() {
    334   std::string result;
    335   if (length_) {
    336     result += " ";
    337     result += PseudoGenerateExpansion(word_count_ > 3 ? length_ : length_ / 2);
    338   }
    339   word_count_ = length_ = 0;
    340   depth_--;
    341   if (depth_ == 0) {
    342     result += "]";
    343   }
    344   return result;
    345 }
    346 
    347 /**
    348  * Converts characters so they look like they've been localized.
    349  *
    350  * Note: This leaves placeholder syntax untouched.
    351  */
    352 std::string PseudoMethodAccent::Text(const StringPiece& source) {
    353   const char* s = source.data();
    354   std::string result;
    355   const size_t I = source.size();
    356   bool lastspace = true;
    357   for (size_t i = 0; i < I; i++) {
    358     char c = s[i];
    359     if (c == '%') {
    360       // Placeholder syntax, no need to pseudolocalize
    361       std::string chunk;
    362       bool end = false;
    363       chunk.append(&c, 1);
    364       while (!end && i + 1 < I) {
    365         ++i;
    366         c = s[i];
    367         chunk.append(&c, 1);
    368         if (IsPossibleNormalPlaceholderEnd(c)) {
    369           end = true;
    370         } else if (i + 1 < I && c == 't') {
    371           ++i;
    372           c = s[i];
    373           chunk.append(&c, 1);
    374           end = true;
    375         }
    376       }
    377       // Treat chunk as a placeholder unless it ends with %.
    378       result += ((c == '%') ? chunk : Placeholder(chunk));
    379     } else if (c == '<' || c == '&') {
    380       // html syntax, no need to pseudolocalize
    381       bool tag_closed = false;
    382       while (!tag_closed && i < I) {
    383         if (c == '&') {
    384           std::string escape_text;
    385           escape_text.append(&c, 1);
    386           bool end = false;
    387           size_t html_code_pos = i;
    388           while (!end && html_code_pos < I) {
    389             ++html_code_pos;
    390             c = s[html_code_pos];
    391             escape_text.append(&c, 1);
    392             // Valid html code
    393             if (c == ';') {
    394               end = true;
    395               i = html_code_pos;
    396             }
    397             // Wrong html code
    398             else if (!((c == '#' || (c >= 'a' && c <= 'z') ||
    399                         (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')))) {
    400               end = true;
    401             }
    402           }
    403           result += escape_text;
    404           if (escape_text != "&lt;") {
    405             tag_closed = true;
    406           }
    407           continue;
    408         }
    409         if (c == '>') {
    410           tag_closed = true;
    411           result.append(&c, 1);
    412           continue;
    413         }
    414         result.append(&c, 1);
    415         i++;
    416         c = s[i];
    417       }
    418     } else {
    419       // This is a pure text that should be pseudolocalized
    420       const char* p = PseudolocalizeChar(c);
    421       if (p != nullptr) {
    422         result += p;
    423       } else {
    424         bool space = isspace(c);
    425         if (lastspace && !space) {
    426           word_count_++;
    427         }
    428         lastspace = space;
    429         result.append(&c, 1);
    430       }
    431       // Count only pseudolocalizable chars and delimiters
    432       length_++;
    433     }
    434   }
    435   return result;
    436 }
    437 
    438 std::string PseudoMethodAccent::Placeholder(const StringPiece& source) {
    439   // Surround a placeholder with brackets
    440   return kPlaceholderOpen + source.to_string() + kPlaceholderClose;
    441 }
    442 
    443 std::string PseudoMethodBidi::Text(const StringPiece& source) {
    444   const char* s = source.data();
    445   std::string result;
    446   bool lastspace = true;
    447   bool space = true;
    448   bool escape = false;
    449   const char ESCAPE_CHAR = '\\';
    450   for (size_t i = 0; i < source.size(); i++) {
    451     char c = s[i];
    452     if (!escape && c == ESCAPE_CHAR) {
    453       escape = true;
    454       continue;
    455     }
    456     space = (!escape && isspace(c)) || (escape && (c == 'n' || c == 't'));
    457     if (lastspace && !space) {
    458       // Word start
    459       result += kRlm + kRlo;
    460     } else if (!lastspace && space) {
    461       // Word end
    462       result += kPdf + kRlm;
    463     }
    464     lastspace = space;
    465     if (escape) {
    466       result.append(&ESCAPE_CHAR, 1);
    467       escape=false;
    468     }
    469     result.append(&c, 1);
    470   }
    471   if (!lastspace) {
    472     // End of last word
    473     result += kPdf + kRlm;
    474   }
    475   return result;
    476 }
    477 
    478 std::string PseudoMethodBidi::Placeholder(const StringPiece& source) {
    479   // Surround a placeholder with directionality change sequence
    480   return kRlm + kRlo + source.to_string() + kPdf + kRlm;
    481 }
    482 
    483 }  // namespace aapt
    484