Home | History | Annotate | Download | only in i18n
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // File utilities that use the ICU library go in this file.
      6 
      7 #include "base/i18n/file_util_icu.h"
      8 
      9 #include "base/files/file_path.h"
     10 #include "base/i18n/icu_string_conversions.h"
     11 #include "base/i18n/string_compare.h"
     12 #include "base/logging.h"
     13 #include "base/memory/scoped_ptr.h"
     14 #include "base/memory/singleton.h"
     15 #include "base/strings/string_util.h"
     16 #include "base/strings/sys_string_conversions.h"
     17 #include "base/strings/utf_string_conversions.h"
     18 #include "build/build_config.h"
     19 #include "third_party/icu/source/common/unicode/uniset.h"
     20 #include "third_party/icu/source/i18n/unicode/coll.h"
     21 
     22 using base::string16;
     23 
     24 namespace {
     25 
     26 class IllegalCharacters {
     27  public:
     28   static IllegalCharacters* GetInstance() {
     29     return Singleton<IllegalCharacters>::get();
     30   }
     31 
     32   bool contains(UChar32 ucs4) {
     33     return !!set->contains(ucs4);
     34   }
     35 
     36   bool containsNone(const string16 &s) {
     37     return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));
     38   }
     39 
     40  private:
     41   friend class Singleton<IllegalCharacters>;
     42   friend struct DefaultSingletonTraits<IllegalCharacters>;
     43 
     44   IllegalCharacters();
     45   ~IllegalCharacters() { }
     46 
     47   scoped_ptr<icu::UnicodeSet> set;
     48 
     49   DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
     50 };
     51 
     52 IllegalCharacters::IllegalCharacters() {
     53   UErrorCode status = U_ZERO_ERROR;
     54   // Control characters, formatting characters, non-characters, and
     55   // some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
     56   // See  http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
     57   // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
     58   // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they
     59   // are legitimate in Arabic and some S/SE Asian scripts. However, when used
     60   // elsewhere, they can be confusing/problematic.
     61   // Also, consider wrapping the set with our Singleton class to create and
     62   // freeze it only once. Note that there's a trade-off between memory and
     63   // speed.
     64 #if defined(WCHAR_T_IS_UTF16)
     65   set.reset(new icu::UnicodeSet(icu::UnicodeString(
     66       L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
     67 #else
     68   set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
     69       "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
     70       status));
     71 #endif
     72   DCHECK(U_SUCCESS(status));
     73   // Add non-characters. If this becomes a performance bottleneck by
     74   // any chance, do not add these to |set| and change IsFilenameLegal()
     75   // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling
     76   // containsNone().
     77   set->add(0xFDD0, 0xFDEF);
     78   for (int i = 0; i <= 0x10; ++i) {
     79     int plane_base = 0x10000 * i;
     80     set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
     81   }
     82   set->freeze();
     83 }
     84 
     85 }  // namespace
     86 
     87 namespace file_util {
     88 
     89 bool IsFilenameLegal(const string16& file_name) {
     90   return IllegalCharacters::GetInstance()->containsNone(file_name);
     91 }
     92 
     93 void ReplaceIllegalCharactersInPath(base::FilePath::StringType* file_name,
     94                                     char replace_char) {
     95   DCHECK(file_name);
     96 
     97   DCHECK(!(IllegalCharacters::GetInstance()->contains(replace_char)));
     98 
     99   // Remove leading and trailing whitespace.
    100   TrimWhitespace(*file_name, TRIM_ALL, file_name);
    101 
    102   IllegalCharacters* illegal = IllegalCharacters::GetInstance();
    103   int cursor = 0;  // The ICU macros expect an int.
    104   while (cursor < static_cast<int>(file_name->size())) {
    105     int char_begin = cursor;
    106     uint32 code_point;
    107 #if defined(OS_MACOSX)
    108     // Mac uses UTF-8 encoding for filenames.
    109     U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
    110             code_point);
    111 #elif defined(OS_WIN)
    112     // Windows uses UTF-16 encoding for filenames.
    113     U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
    114              code_point);
    115 #elif defined(OS_POSIX)
    116     // Linux doesn't actually define an encoding. It basically allows anything
    117     // except for a few special ASCII characters.
    118     unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]);
    119     if (cur_char >= 0x80)
    120       continue;
    121     code_point = cur_char;
    122 #else
    123     NOTREACHED();
    124 #endif
    125 
    126     if (illegal->contains(code_point)) {
    127       file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
    128       // We just made the potentially multi-byte/word char into one that only
    129       // takes one byte/word, so need to adjust the cursor to point to the next
    130       // character again.
    131       cursor = char_begin + 1;
    132     }
    133   }
    134 }
    135 
    136 bool LocaleAwareCompareFilenames(const base::FilePath& a,
    137                                  const base::FilePath& b) {
    138   UErrorCode error_code = U_ZERO_ERROR;
    139   // Use the default collator. The default locale should have been properly
    140   // set by the time this constructor is called.
    141   scoped_ptr<icu::Collator> collator(icu::Collator::createInstance(error_code));
    142   DCHECK(U_SUCCESS(error_code));
    143   // Make it case-sensitive.
    144   collator->setStrength(icu::Collator::TERTIARY);
    145 
    146 #if defined(OS_WIN)
    147   return base::i18n::CompareString16WithCollator(collator.get(),
    148       WideToUTF16(a.value()), WideToUTF16(b.value())) == UCOL_LESS;
    149 
    150 #elif defined(OS_POSIX)
    151   // On linux, the file system encoding is not defined. We assume
    152   // SysNativeMBToWide takes care of it.
    153   return base::i18n::CompareString16WithCollator(collator.get(),
    154       WideToUTF16(base::SysNativeMBToWide(a.value().c_str())),
    155       WideToUTF16(base::SysNativeMBToWide(b.value().c_str()))) == UCOL_LESS;
    156 #else
    157   #error Not implemented on your system
    158 #endif
    159 }
    160 
    161 void NormalizeFileNameEncoding(base::FilePath* file_name) {
    162 #if defined(OS_CHROMEOS)
    163   std::string normalized_str;
    164   if (base::ConvertToUtf8AndNormalize(file_name->BaseName().value(),
    165                                       base::kCodepageUTF8,
    166                                       &normalized_str)) {
    167     *file_name = file_name->DirName().Append(base::FilePath(normalized_str));
    168   }
    169 #endif
    170 }
    171 
    172 }  // namespace file_util
    173