Home | History | Annotate | Download | only in i18n
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/i18n/word_iterator.h"
      6 
      7 #include "base/logging.h"
      8 #include "unicode/ubrk.h"
      9 #include "unicode/ustring.h"
     10 
     11 const size_t npos = -1;
     12 
     13 WordIterator::WordIterator(const std::wstring& str, BreakType break_type)
     14     : iter_(NULL),
     15       string_(str),
     16       break_type_(break_type),
     17       prev_(npos),
     18       pos_(0) {
     19 }
     20 
     21 WordIterator::~WordIterator() {
     22   if (iter_)
     23     ubrk_close(iter_);
     24 }
     25 
     26 bool WordIterator::Init() {
     27   UErrorCode status = U_ZERO_ERROR;
     28   UBreakIteratorType break_type;
     29   switch (break_type_) {
     30     case BREAK_WORD:
     31       break_type = UBRK_WORD;
     32       break;
     33     case BREAK_LINE:
     34       break_type = UBRK_LINE;
     35       break;
     36     default:
     37       NOTREACHED();
     38       break_type = UBRK_LINE;
     39   }
     40 #if defined(WCHAR_T_IS_UTF16)
     41   iter_ = ubrk_open(break_type, NULL,
     42                     string_.data(), static_cast<int32_t>(string_.size()),
     43                     &status);
     44 #else  // WCHAR_T_IS_UTF16
     45   // When wchar_t is wider than UChar (16 bits), transform |string_| into a
     46   // UChar* string.  Size the UChar* buffer to be large enough to hold twice
     47   // as many UTF-16 code points as there are UCS-4 characters, in case each
     48   // character translates to a UTF-16 surrogate pair, and leave room for a NUL
     49   // terminator.
     50   // TODO(avi): avoid this alloc
     51   chars_.resize(string_.length() * sizeof(UChar) + 1);
     52 
     53   UErrorCode error = U_ZERO_ERROR;
     54   int32_t destLength;
     55   u_strFromWCS(&chars_[0], chars_.size(), &destLength, string_.data(),
     56                string_.length(), &error);
     57 
     58   iter_ = ubrk_open(break_type, NULL, &chars_[0], destLength, &status);
     59 #endif
     60   if (U_FAILURE(status)) {
     61     NOTREACHED() << "ubrk_open failed";
     62     return false;
     63   }
     64   ubrk_first(iter_);  // Move the iterator to the beginning of the string.
     65   return true;
     66 }
     67 
     68 bool WordIterator::Advance() {
     69   prev_ = pos_;
     70   const int32_t pos = ubrk_next(iter_);
     71   if (pos == UBRK_DONE) {
     72     pos_ = npos;
     73     return false;
     74   } else {
     75     pos_ = static_cast<size_t>(pos);
     76     return true;
     77   }
     78 }
     79 
     80 bool WordIterator::IsWord() const {
     81   return (ubrk_getRuleStatus(iter_) != UBRK_WORD_NONE);
     82 }
     83 
     84 std::wstring WordIterator::GetWord() const {
     85   DCHECK(prev_ != npos && pos_ != npos);
     86 #if defined(WCHAR_T_IS_UTF16)
     87   return string_.substr(prev_, pos_ - prev_);
     88 #else  // WCHAR_T_IS_UTF16
     89   // See comment in Init().  If there are no surrogate pairs,
     90   // |out_length| will be exactly |in_length|, if there are surrogate
     91   // pairs it will be less than |in_length|.
     92   int32_t out_length;
     93   UErrorCode error = U_ZERO_ERROR;
     94   const int32_t in_length = pos_ - prev_;
     95   std::vector<std::wstring::value_type> out_buffer(in_length);
     96   u_strToWCS(&out_buffer[0], in_length, &out_length,
     97              &chars_[prev_], in_length, &error);
     98   DCHECK_LE(out_length, in_length);
     99   return std::wstring(&out_buffer[0], out_length);
    100 #endif
    101 }
    102