Home | History | Annotate | Download | only in compile
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "compile/Pseudolocalizer.h"
     18 #include "util/Util.h"
     19 
     20 namespace aapt {
     21 
     22 // String basis to generate expansion
     23 static const std::u16string k_expansion_string = u"one two three "
     24         "four five six seven eight nine ten eleven twelve thirteen "
     25         "fourteen fiveteen sixteen seventeen nineteen twenty";
     26 
     27 // Special unicode characters to override directionality of the words
     28 static const std::u16string k_rlm = u"\u200f";
     29 static const std::u16string k_rlo = u"\u202e";
     30 static const std::u16string k_pdf = u"\u202c";
     31 
     32 // Placeholder marks
     33 static const std::u16string k_placeholder_open = u"\u00bb";
     34 static const std::u16string k_placeholder_close = u"\u00ab";
     35 
     36 static const char16_t k_arg_start = u'{';
     37 static const char16_t k_arg_end = u'}';
     38 
     39 class PseudoMethodNone : public PseudoMethodImpl {
     40 public:
     41     std::u16string text(const StringPiece16& text) override { return text.toString(); }
     42     std::u16string placeholder(const StringPiece16& text) override { return text.toString(); }
     43 };
     44 
     45 class PseudoMethodBidi : public PseudoMethodImpl {
     46 public:
     47     std::u16string text(const StringPiece16& text) override;
     48     std::u16string placeholder(const StringPiece16& text) override;
     49 };
     50 
     51 class PseudoMethodAccent : public PseudoMethodImpl {
     52 public:
     53     PseudoMethodAccent() : mDepth(0), mWordCount(0), mLength(0) {}
     54     std::u16string start() override;
     55     std::u16string end() override;
     56     std::u16string text(const StringPiece16& text) override;
     57     std::u16string placeholder(const StringPiece16& text) override;
     58 private:
     59     size_t mDepth;
     60     size_t mWordCount;
     61     size_t mLength;
     62 };
     63 
     64 Pseudolocalizer::Pseudolocalizer(Method method) : mLastDepth(0) {
     65     setMethod(method);
     66 }
     67 
     68 void Pseudolocalizer::setMethod(Method method) {
     69     switch (method) {
     70     case Method::kNone:
     71         mImpl = util::make_unique<PseudoMethodNone>();
     72         break;
     73     case Method::kAccent:
     74         mImpl = util::make_unique<PseudoMethodAccent>();
     75         break;
     76     case Method::kBidi:
     77         mImpl = util::make_unique<PseudoMethodBidi>();
     78         break;
     79     }
     80 }
     81 
     82 std::u16string Pseudolocalizer::text(const StringPiece16& text) {
     83     std::u16string out;
     84     size_t depth = mLastDepth;
     85     size_t lastpos, pos;
     86     const size_t length = text.size();
     87     const char16_t* str = text.data();
     88     bool escaped = false;
     89     for (lastpos = pos = 0; pos < length; pos++) {
     90         char16_t c = str[pos];
     91         if (escaped) {
     92             escaped = false;
     93             continue;
     94         }
     95         if (c == '\'') {
     96             escaped = true;
     97             continue;
     98         }
     99 
    100         if (c == k_arg_start) {
    101             depth++;
    102         } else if (c == k_arg_end && depth) {
    103             depth--;
    104         }
    105 
    106         if (mLastDepth != depth || pos == length - 1) {
    107             bool pseudo = ((mLastDepth % 2) == 0);
    108             size_t nextpos = pos;
    109             if (!pseudo || depth == mLastDepth) {
    110                 nextpos++;
    111             }
    112             size_t size = nextpos - lastpos;
    113             if (size) {
    114                 std::u16string chunk = text.substr(lastpos, size).toString();
    115                 if (pseudo) {
    116                     chunk = mImpl->text(chunk);
    117                 } else if (str[lastpos] == k_arg_start && str[nextpos - 1] == k_arg_end) {
    118                     chunk = mImpl->placeholder(chunk);
    119                 }
    120                 out.append(chunk);
    121             }
    122             if (pseudo && depth < mLastDepth) { // End of message
    123                 out.append(mImpl->end());
    124             } else if (!pseudo && depth > mLastDepth) { // Start of message
    125                 out.append(mImpl->start());
    126             }
    127             lastpos = nextpos;
    128             mLastDepth = depth;
    129         }
    130     }
    131     return out;
    132 }
    133 
    134 static const char16_t* pseudolocalizeChar(const char16_t c) {
    135     switch (c) {
    136         case 'a':   return u"\u00e5";
    137         case 'b':   return u"\u0253";
    138         case 'c':   return u"\u00e7";
    139         case 'd':   return u"\u00f0";
    140         case 'e':   return u"\u00e9";
    141         case 'f':   return u"\u0192";
    142         case 'g':   return u"\u011d";
    143         case 'h':   return u"\u0125";
    144         case 'i':   return u"\u00ee";
    145         case 'j':   return u"\u0135";
    146         case 'k':   return u"\u0137";
    147         case 'l':   return u"\u013c";
    148         case 'm':   return u"\u1e3f";
    149         case 'n':   return u"\u00f1";
    150         case 'o':   return u"\u00f6";
    151         case 'p':   return u"\u00fe";
    152         case 'q':   return u"\u0051";
    153         case 'r':   return u"\u0155";
    154         case 's':   return u"\u0161";
    155         case 't':   return u"\u0163";
    156         case 'u':   return u"\u00fb";
    157         case 'v':   return u"\u0056";
    158         case 'w':   return u"\u0175";
    159         case 'x':   return u"\u0445";
    160         case 'y':   return u"\u00fd";
    161         case 'z':   return u"\u017e";
    162         case 'A':   return u"\u00c5";
    163         case 'B':   return u"\u03b2";
    164         case 'C':   return u"\u00c7";
    165         case 'D':   return u"\u00d0";
    166         case 'E':   return u"\u00c9";
    167         case 'G':   return u"\u011c";
    168         case 'H':   return u"\u0124";
    169         case 'I':   return u"\u00ce";
    170         case 'J':   return u"\u0134";
    171         case 'K':   return u"\u0136";
    172         case 'L':   return u"\u013b";
    173         case 'M':   return u"\u1e3e";
    174         case 'N':   return u"\u00d1";
    175         case 'O':   return u"\u00d6";
    176         case 'P':   return u"\u00de";
    177         case 'Q':   return u"\u0071";
    178         case 'R':   return u"\u0154";
    179         case 'S':   return u"\u0160";
    180         case 'T':   return u"\u0162";
    181         case 'U':   return u"\u00db";
    182         case 'V':   return u"\u03bd";
    183         case 'W':   return u"\u0174";
    184         case 'X':   return u"\u00d7";
    185         case 'Y':   return u"\u00dd";
    186         case 'Z':   return u"\u017d";
    187         case '!':   return u"\u00a1";
    188         case '?':   return u"\u00bf";
    189         case '$':   return u"\u20ac";
    190         default:    return NULL;
    191     }
    192 }
    193 
    194 static bool isPossibleNormalPlaceholderEnd(const char16_t c) {
    195     switch (c) {
    196         case 's': return true;
    197         case 'S': return true;
    198         case 'c': return true;
    199         case 'C': return true;
    200         case 'd': return true;
    201         case 'o': return true;
    202         case 'x': return true;
    203         case 'X': return true;
    204         case 'f': return true;
    205         case 'e': return true;
    206         case 'E': return true;
    207         case 'g': return true;
    208         case 'G': return true;
    209         case 'a': return true;
    210         case 'A': return true;
    211         case 'b': return true;
    212         case 'B': return true;
    213         case 'h': return true;
    214         case 'H': return true;
    215         case '%': return true;
    216         case 'n': return true;
    217         default:  return false;
    218     }
    219 }
    220 
    221 static std::u16string pseudoGenerateExpansion(const unsigned int length) {
    222     std::u16string result = k_expansion_string;
    223     const char16_t* s = result.data();
    224     if (result.size() < length) {
    225         result += u" ";
    226         result += pseudoGenerateExpansion(length - result.size());
    227     } else {
    228         int ext = 0;
    229         // Should contain only whole words, so looking for a space
    230         for (unsigned int i = length + 1; i < result.size(); ++i) {
    231             ++ext;
    232             if (s[i] == ' ') {
    233                 break;
    234             }
    235         }
    236         result = result.substr(0, length + ext);
    237     }
    238     return result;
    239 }
    240 
    241 std::u16string PseudoMethodAccent::start() {
    242     std::u16string result;
    243     if (mDepth == 0) {
    244         result = u"[";
    245     }
    246     mWordCount = mLength = 0;
    247     mDepth++;
    248     return result;
    249 }
    250 
    251 std::u16string PseudoMethodAccent::end() {
    252     std::u16string result;
    253     if (mLength) {
    254         result += u" ";
    255         result += pseudoGenerateExpansion(mWordCount > 3 ? mLength : mLength / 2);
    256     }
    257     mWordCount = mLength = 0;
    258     mDepth--;
    259     if (mDepth == 0) {
    260         result += u"]";
    261     }
    262     return result;
    263 }
    264 
    265 /**
    266  * Converts characters so they look like they've been localized.
    267  *
    268  * Note: This leaves placeholder syntax untouched.
    269  */
    270 std::u16string PseudoMethodAccent::text(const StringPiece16& source)
    271 {
    272     const char16_t* s = source.data();
    273     std::u16string result;
    274     const size_t I = source.size();
    275     bool lastspace = true;
    276     for (size_t i = 0; i < I; i++) {
    277         char16_t c = s[i];
    278         if (c == '%') {
    279             // Placeholder syntax, no need to pseudolocalize
    280             std::u16string chunk;
    281             bool end = false;
    282             chunk.append(&c, 1);
    283             while (!end && i < I) {
    284                 ++i;
    285                 c = s[i];
    286                 chunk.append(&c, 1);
    287                 if (isPossibleNormalPlaceholderEnd(c)) {
    288                     end = true;
    289                 } else if (c == 't') {
    290                     ++i;
    291                     c = s[i];
    292                     chunk.append(&c, 1);
    293                     end = true;
    294                 }
    295             }
    296             // Treat chunk as a placeholder unless it ends with %.
    297             result += ((c == '%') ? chunk : placeholder(chunk));
    298         } else if (c == '<' || c == '&') {
    299             // html syntax, no need to pseudolocalize
    300             bool tag_closed = false;
    301             while (!tag_closed && i < I) {
    302                 if (c == '&') {
    303                     std::u16string escapeText;
    304                     escapeText.append(&c, 1);
    305                     bool end = false;
    306                     size_t htmlCodePos = i;
    307                     while (!end && htmlCodePos < I) {
    308                         ++htmlCodePos;
    309                         c = s[htmlCodePos];
    310                         escapeText.append(&c, 1);
    311                         // Valid html code
    312                         if (c == ';') {
    313                             end = true;
    314                             i = htmlCodePos;
    315                         }
    316                         // Wrong html code
    317                         else if (!((c == '#' ||
    318                                  (c >= 'a' && c <= 'z') ||
    319                                  (c >= 'A' && c <= 'Z') ||
    320                                  (c >= '0' && c <= '9')))) {
    321                             end = true;
    322                         }
    323                     }
    324                     result += escapeText;
    325                     if (escapeText != u"&lt;") {
    326                         tag_closed = true;
    327                     }
    328                     continue;
    329                 }
    330                 if (c == '>') {
    331                     tag_closed = true;
    332                     result.append(&c, 1);
    333                     continue;
    334                 }
    335                 result.append(&c, 1);
    336                 i++;
    337                 c = s[i];
    338             }
    339         } else {
    340             // This is a pure text that should be pseudolocalized
    341             const char16_t* p = pseudolocalizeChar(c);
    342             if (p != nullptr) {
    343                 result += p;
    344             } else {
    345                 bool space = util::isspace16(c);
    346                 if (lastspace && !space) {
    347                     mWordCount++;
    348                 }
    349                 lastspace = space;
    350                 result.append(&c, 1);
    351             }
    352             // Count only pseudolocalizable chars and delimiters
    353             mLength++;
    354         }
    355     }
    356     return result;
    357 }
    358 
    359 std::u16string PseudoMethodAccent::placeholder(const StringPiece16& source) {
    360     // Surround a placeholder with brackets
    361     return k_placeholder_open + source.toString() + k_placeholder_close;
    362 }
    363 
    364 std::u16string PseudoMethodBidi::text(const StringPiece16& source) {
    365     const char16_t* s = source.data();
    366     std::u16string result;
    367     bool lastspace = true;
    368     bool space = true;
    369     for (size_t i = 0; i < source.size(); i++) {
    370         char16_t c = s[i];
    371         space = util::isspace16(c);
    372         if (lastspace && !space) {
    373             // Word start
    374             result += k_rlm + k_rlo;
    375         } else if (!lastspace && space) {
    376             // Word end
    377             result += k_pdf + k_rlm;
    378         }
    379         lastspace = space;
    380         result.append(&c, 1);
    381     }
    382     if (!lastspace) {
    383         // End of last word
    384         result += k_pdf + k_rlm;
    385     }
    386     return result;
    387 }
    388 
    389 std::u16string PseudoMethodBidi::placeholder(const StringPiece16& source) {
    390     // Surround a placeholder with directionality change sequence
    391     return k_rlm + k_rlo + source.toString() + k_pdf + k_rlm;
    392 }
    393 
    394 } // namespace aapt
    395