Home | History | Annotate | Download | only in i18n
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/i18n/rtl.h"
      6 
      7 #include "base/files/file_path.h"
      8 #include "base/logging.h"
      9 #include "base/strings/string_util.h"
     10 #include "base/strings/sys_string_conversions.h"
     11 #include "base/strings/utf_string_conversions.h"
     12 #include "third_party/icu/source/common/unicode/locid.h"
     13 #include "third_party/icu/source/common/unicode/uchar.h"
     14 #include "third_party/icu/source/common/unicode/uscript.h"
     15 #include "third_party/icu/source/i18n/unicode/coll.h"
     16 
     17 namespace {
     18 
     19 // Extract language, country and variant, but ignore keywords.  For example,
     20 // en-US, ca@valencia, ca-ES@valencia.
     21 std::string GetLocaleString(const icu::Locale& locale) {
     22   const char* language = locale.getLanguage();
     23   const char* country = locale.getCountry();
     24   const char* variant = locale.getVariant();
     25 
     26   std::string result =
     27       (language != NULL && *language != '\0') ? language : "und";
     28 
     29   if (country != NULL && *country != '\0') {
     30     result += '-';
     31     result += country;
     32   }
     33 
     34   if (variant != NULL && *variant != '\0') {
     35     std::string variant_str(variant);
     36     StringToLowerASCII(&variant_str);
     37     result += '@' + variant_str;
     38   }
     39 
     40   return result;
     41 }
     42 
     43 // Returns LEFT_TO_RIGHT or RIGHT_TO_LEFT if |character| has strong
     44 // directionality, returns UNKNOWN_DIRECTION if it doesn't. Please refer to
     45 // http://unicode.org/reports/tr9/ for more information.
     46 base::i18n::TextDirection GetCharacterDirection(UChar32 character) {
     47   // Now that we have the character, we use ICU in order to query for the
     48   // appropriate Unicode BiDi character type.
     49   int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
     50   if ((property == U_RIGHT_TO_LEFT) ||
     51       (property == U_RIGHT_TO_LEFT_ARABIC) ||
     52       (property == U_RIGHT_TO_LEFT_EMBEDDING) ||
     53       (property == U_RIGHT_TO_LEFT_OVERRIDE)) {
     54     return base::i18n::RIGHT_TO_LEFT;
     55   } else if ((property == U_LEFT_TO_RIGHT) ||
     56              (property == U_LEFT_TO_RIGHT_EMBEDDING) ||
     57              (property == U_LEFT_TO_RIGHT_OVERRIDE)) {
     58     return base::i18n::LEFT_TO_RIGHT;
     59   }
     60   return base::i18n::UNKNOWN_DIRECTION;
     61 }
     62 
     63 }  // namespace
     64 
     65 namespace base {
     66 namespace i18n {
     67 
// Cached text direction of the ICU default locale.  UNKNOWN_DIRECTION means
// "not computed yet": ICUIsRTL() fills the value in on first use and
// SetICUDefaultLocale() resets it to UNKNOWN_DIRECTION whenever the default
// locale is changed.
static TextDirection g_icu_text_direction = UNKNOWN_DIRECTION;
     70 
     71 // Convert the ICU default locale to a string.
     72 std::string GetConfiguredLocale() {
     73   return GetLocaleString(icu::Locale::getDefault());
     74 }
     75 
     76 // Convert the ICU canonicalized locale to a string.
     77 std::string GetCanonicalLocale(const char* locale) {
     78   return GetLocaleString(icu::Locale::createCanonical(locale));
     79 }
     80 
     81 // Convert Chrome locale name to ICU locale name
     82 std::string ICULocaleName(const std::string& locale_string) {
     83   // If not Spanish, just return it.
     84   if (locale_string.substr(0, 2) != "es")
     85     return locale_string;
     86   // Expand es to es-ES.
     87   if (LowerCaseEqualsASCII(locale_string, "es"))
     88     return "es-ES";
     89   // Map es-419 (Latin American Spanish) to es-FOO depending on the system
     90   // locale.  If it's es-RR other than es-ES, map to es-RR. Otherwise, map
     91   // to es-MX (the most populous in Spanish-speaking Latin America).
     92   if (LowerCaseEqualsASCII(locale_string, "es-419")) {
     93     const icu::Locale& locale = icu::Locale::getDefault();
     94     std::string language = locale.getLanguage();
     95     const char* country = locale.getCountry();
     96     if (LowerCaseEqualsASCII(language, "es") &&
     97       !LowerCaseEqualsASCII(country, "es")) {
     98         language += '-';
     99         language += country;
    100         return language;
    101     }
    102     return "es-MX";
    103   }
    104   // Currently, Chrome has only "es" and "es-419", but later we may have
    105   // more specific "es-RR".
    106   return locale_string;
    107 }
    108 
    109 void SetICUDefaultLocale(const std::string& locale_string) {
    110   icu::Locale locale(ICULocaleName(locale_string).c_str());
    111   UErrorCode error_code = U_ZERO_ERROR;
    112   icu::Locale::setDefault(locale, error_code);
    113   // This return value is actually bogus because Locale object is
    114   // an ID and setDefault seems to always succeed (regardless of the
    115   // presence of actual locale data). However,
    116   // it does not hurt to have it as a sanity check.
    117   DCHECK(U_SUCCESS(error_code));
    118   g_icu_text_direction = UNKNOWN_DIRECTION;
    119 }
    120 
    121 bool IsRTL() {
    122   return ICUIsRTL();
    123 }
    124 
    125 bool ICUIsRTL() {
    126   if (g_icu_text_direction == UNKNOWN_DIRECTION) {
    127     const icu::Locale& locale = icu::Locale::getDefault();
    128     g_icu_text_direction = GetTextDirectionForLocale(locale.getName());
    129   }
    130   return g_icu_text_direction == RIGHT_TO_LEFT;
    131 }
    132 
    133 TextDirection GetTextDirectionForLocale(const char* locale_name) {
    134   UErrorCode status = U_ZERO_ERROR;
    135   ULayoutType layout_dir = uloc_getCharacterOrientation(locale_name, &status);
    136   DCHECK(U_SUCCESS(status));
    137   // Treat anything other than RTL as LTR.
    138   return (layout_dir != ULOC_LAYOUT_RTL) ? LEFT_TO_RIGHT : RIGHT_TO_LEFT;
    139 }
    140 
    141 TextDirection GetFirstStrongCharacterDirection(const string16& text) {
    142   const UChar* string = text.c_str();
    143   size_t length = text.length();
    144   size_t position = 0;
    145   while (position < length) {
    146     UChar32 character;
    147     size_t next_position = position;
    148     U16_NEXT(string, next_position, length, character);
    149     TextDirection direction = GetCharacterDirection(character);
    150     if (direction != UNKNOWN_DIRECTION)
    151       return direction;
    152     position = next_position;
    153   }
    154   return LEFT_TO_RIGHT;
    155 }
    156 
    157 TextDirection GetLastStrongCharacterDirection(const string16& text) {
    158   const UChar* string = text.c_str();
    159   size_t position = text.length();
    160   while (position > 0) {
    161     UChar32 character;
    162     size_t prev_position = position;
    163     U16_PREV(string, 0, prev_position, character);
    164     TextDirection direction = GetCharacterDirection(character);
    165     if (direction != UNKNOWN_DIRECTION)
    166       return direction;
    167     position = prev_position;
    168   }
    169   return LEFT_TO_RIGHT;
    170 }
    171 
    172 TextDirection GetStringDirection(const string16& text) {
    173   const UChar* string = text.c_str();
    174   size_t length = text.length();
    175   size_t position = 0;
    176 
    177   TextDirection result(UNKNOWN_DIRECTION);
    178   while (position < length) {
    179     UChar32 character;
    180     size_t next_position = position;
    181     U16_NEXT(string, next_position, length, character);
    182     TextDirection direction = GetCharacterDirection(character);
    183     if (direction != UNKNOWN_DIRECTION) {
    184       if (result != UNKNOWN_DIRECTION && result != direction)
    185         return UNKNOWN_DIRECTION;
    186       result = direction;
    187     }
    188     position = next_position;
    189   }
    190 
    191   // Handle the case of a string not containing any strong directionality
    192   // characters defaulting to LEFT_TO_RIGHT.
    193   if (result == UNKNOWN_DIRECTION)
    194     return LEFT_TO_RIGHT;
    195 
    196   return result;
    197 }
    198 
    199 #if defined(OS_WIN)
    200 bool AdjustStringForLocaleDirection(string16* text) {
    201   if (!IsRTL() || text->empty())
    202     return false;
    203 
    204   // Marking the string as LTR if the locale is RTL and the string does not
    205   // contain strong RTL characters. Otherwise, mark the string as RTL.
    206   bool has_rtl_chars = StringContainsStrongRTLChars(*text);
    207   if (!has_rtl_chars)
    208     WrapStringWithLTRFormatting(text);
    209   else
    210     WrapStringWithRTLFormatting(text);
    211 
    212   return true;
    213 }
    214 
    215 bool UnadjustStringForLocaleDirection(string16* text) {
    216   if (!IsRTL() || text->empty())
    217     return false;
    218 
    219   *text = StripWrappingBidiControlCharacters(*text);
    220   return true;
    221 }
    222 #else
    223 bool AdjustStringForLocaleDirection(string16* text) {
    224   // On OS X & GTK the directionality of a label is determined by the first
    225   // strongly directional character.
    226   // However, we want to make sure that in an LTR-language-UI all strings are
    227   // left aligned and vice versa.
    228   // A problem can arise if we display a string which starts with user input.
    229   // User input may be of the opposite directionality to the UI. So the whole
    230   // string will be displayed in the opposite directionality, e.g. if we want to
    231   // display in an LTR UI [such as US English]:
    232   //
    233   // EMAN_NOISNETXE is now installed.
    234   //
    235   // Since EXTENSION_NAME begins with a strong RTL char, the label's
    236   // directionality will be set to RTL and the string will be displayed visually
    237   // as:
    238   //
    239   // .is now installed EMAN_NOISNETXE
    240   //
    241   // In order to solve this issue, we prepend an LRM to the string. An LRM is a
    242   // strongly directional LTR char.
    243   // We also append an LRM at the end, which ensures that we're in an LTR
    244   // context.
    245 
    246   // Unlike Windows, Linux and OS X can correctly display RTL glyphs out of the
    247   // box so there is no issue with displaying zero-width bidi control characters
    248   // on any system.  Thus no need for the !IsRTL() check here.
    249   if (text->empty())
    250     return false;
    251 
    252   bool ui_direction_is_rtl = IsRTL();
    253 
    254   bool has_rtl_chars = StringContainsStrongRTLChars(*text);
    255   if (!ui_direction_is_rtl && has_rtl_chars) {
    256     WrapStringWithRTLFormatting(text);
    257     text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
    258                  kLeftToRightMark);
    259     text->push_back(kLeftToRightMark);
    260   } else if (ui_direction_is_rtl && has_rtl_chars) {
    261     WrapStringWithRTLFormatting(text);
    262     text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
    263                  kRightToLeftMark);
    264     text->push_back(kRightToLeftMark);
    265   } else if (ui_direction_is_rtl) {
    266     WrapStringWithLTRFormatting(text);
    267     text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
    268                  kRightToLeftMark);
    269     text->push_back(kRightToLeftMark);
    270   } else {
    271     return false;
    272   }
    273 
    274   return true;
    275 }
    276 
    277 bool UnadjustStringForLocaleDirection(string16* text) {
    278   if (text->empty())
    279     return false;
    280 
    281   size_t begin_index = 0;
    282   char16 begin = text->at(begin_index);
    283   if (begin == kLeftToRightMark ||
    284       begin == kRightToLeftMark) {
    285     ++begin_index;
    286   }
    287 
    288   size_t end_index = text->length() - 1;
    289   char16 end = text->at(end_index);
    290   if (end == kLeftToRightMark ||
    291       end == kRightToLeftMark) {
    292     --end_index;
    293   }
    294 
    295   string16 unmarked_text =
    296       text->substr(begin_index, end_index - begin_index + 1);
    297   *text = StripWrappingBidiControlCharacters(unmarked_text);
    298   return true;
    299 }
    300 
    301 #endif  // !OS_WIN
    302 
    303 bool StringContainsStrongRTLChars(const string16& text) {
    304   const UChar* string = text.c_str();
    305   size_t length = text.length();
    306   size_t position = 0;
    307   while (position < length) {
    308     UChar32 character;
    309     size_t next_position = position;
    310     U16_NEXT(string, next_position, length, character);
    311 
    312     // Now that we have the character, we use ICU in order to query for the
    313     // appropriate Unicode BiDi character type.
    314     int32_t property = u_getIntPropertyValue(character, UCHAR_BIDI_CLASS);
    315     if ((property == U_RIGHT_TO_LEFT) || (property == U_RIGHT_TO_LEFT_ARABIC))
    316       return true;
    317 
    318     position = next_position;
    319   }
    320 
    321   return false;
    322 }
    323 
    324 void WrapStringWithLTRFormatting(string16* text) {
    325   if (text->empty())
    326     return;
    327 
    328   // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
    329   text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
    330                kLeftToRightEmbeddingMark);
    331 
    332   // Inserting a PDF (Pop Directional Formatting) mark as the last character.
    333   text->push_back(kPopDirectionalFormatting);
    334 }
    335 
    336 void WrapStringWithRTLFormatting(string16* text) {
    337   if (text->empty())
    338     return;
    339 
    340   // Inserting an RLE (Right-To-Left Embedding) mark as the first character.
    341   text->insert(static_cast<size_t>(0), static_cast<size_t>(1),
    342                kRightToLeftEmbeddingMark);
    343 
    344   // Inserting a PDF (Pop Directional Formatting) mark as the last character.
    345   text->push_back(kPopDirectionalFormatting);
    346 }
    347 
    348 void WrapPathWithLTRFormatting(const FilePath& path,
    349                                string16* rtl_safe_path) {
    350   // Wrap the overall path with LRE-PDF pair which essentialy marks the
    351   // string as a Left-To-Right string.
    352   // Inserting an LRE (Left-To-Right Embedding) mark as the first character.
    353   rtl_safe_path->push_back(kLeftToRightEmbeddingMark);
    354 #if defined(OS_MACOSX)
    355     rtl_safe_path->append(UTF8ToUTF16(path.value()));
    356 #elif defined(OS_WIN)
    357     rtl_safe_path->append(path.value());
    358 #else  // defined(OS_POSIX) && !defined(OS_MACOSX)
    359     std::wstring wide_path = base::SysNativeMBToWide(path.value());
    360     rtl_safe_path->append(WideToUTF16(wide_path));
    361 #endif
    362   // Inserting a PDF (Pop Directional Formatting) mark as the last character.
    363   rtl_safe_path->push_back(kPopDirectionalFormatting);
    364 }
    365 
    366 string16 GetDisplayStringInLTRDirectionality(const string16& text) {
    367   // Always wrap the string in RTL UI (it may be appended to RTL string).
    368   // Also wrap strings with an RTL first strong character direction in LTR UI.
    369   if (IsRTL() || GetFirstStrongCharacterDirection(text) == RIGHT_TO_LEFT) {
    370     string16 text_mutable(text);
    371     WrapStringWithLTRFormatting(&text_mutable);
    372     return text_mutable;
    373   }
    374   return text;
    375 }
    376 
    377 string16 StripWrappingBidiControlCharacters(const string16& text) {
    378   if (text.empty())
    379     return text;
    380   size_t begin_index = 0;
    381   char16 begin = text[begin_index];
    382   if (begin == kLeftToRightEmbeddingMark ||
    383       begin == kRightToLeftEmbeddingMark ||
    384       begin == kLeftToRightOverride ||
    385       begin == kRightToLeftOverride)
    386     ++begin_index;
    387   size_t end_index = text.length() - 1;
    388   if (text[end_index] == kPopDirectionalFormatting)
    389     --end_index;
    390   return text.substr(begin_index, end_index - begin_index + 1);
    391 }
    392 
    393 }  // namespace i18n
    394 }  // namespace base
    395