Home | History | Annotate | Download | only in i18n
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/i18n/break_iterator.h"
      6 
      7 #include "base/logging.h"
      8 #include "third_party/icu/source/common/unicode/ubrk.h"
      9 #include "third_party/icu/source/common/unicode/uchar.h"
     10 #include "third_party/icu/source/common/unicode/ustring.h"
     11 
     12 namespace base {
     13 namespace i18n {
     14 
     15 const size_t npos = static_cast<size_t>(-1);
     16 
     17 BreakIterator::BreakIterator(const string16& str, BreakType break_type)
     18     : iter_(NULL),
     19       string_(str),
     20       break_type_(break_type),
     21       prev_(npos),
     22       pos_(0) {
     23 }
     24 
     25 BreakIterator::BreakIterator(const string16& str, const string16& rules)
     26     : iter_(NULL),
     27       string_(str),
     28       rules_(rules),
     29       break_type_(RULE_BASED),
     30       prev_(npos),
     31       pos_(0) {
     32 }
     33 
     34 BreakIterator::~BreakIterator() {
     35   if (iter_)
     36     ubrk_close(static_cast<UBreakIterator*>(iter_));
     37 }
     38 
     39 bool BreakIterator::Init() {
     40   UErrorCode status = U_ZERO_ERROR;
     41   UParseError parse_error;
     42   UBreakIteratorType break_type;
     43   switch (break_type_) {
     44     case BREAK_CHARACTER:
     45       break_type = UBRK_CHARACTER;
     46       break;
     47     case BREAK_WORD:
     48       break_type = UBRK_WORD;
     49       break;
     50     case BREAK_LINE:
     51     case BREAK_NEWLINE:
     52     case RULE_BASED: // (Keep compiler happy, break_type not used in this case)
     53       break_type = UBRK_LINE;
     54       break;
     55     default:
     56       NOTREACHED() << "invalid break_type_";
     57       return false;
     58   }
     59   if (break_type_ == RULE_BASED) {
     60     iter_ = ubrk_openRules(rules_.c_str(),
     61                            static_cast<int32_t>(rules_.length()),
     62                            string_.data(),
     63                            static_cast<int32_t>(string_.size()),
     64                            &parse_error,
     65                            &status);
     66     if (U_FAILURE(status)) {
     67       NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
     68           << parse_error.line << ", offset " << parse_error.offset;
     69     }
     70   } else {
     71     iter_ = ubrk_open(break_type,
     72                       NULL,
     73                       string_.data(),
     74                       static_cast<int32_t>(string_.size()),
     75                       &status);
     76     if (U_FAILURE(status)) {
     77       NOTREACHED() << "ubrk_open failed";
     78     }
     79   }
     80 
     81   if (U_FAILURE(status)) {
     82     return false;
     83   }
     84 
     85   // Move the iterator to the beginning of the string.
     86   ubrk_first(static_cast<UBreakIterator*>(iter_));
     87   return true;
     88 }
     89 
     90 bool BreakIterator::Advance() {
     91   int32_t pos;
     92   int32_t status;
     93   prev_ = pos_;
     94   switch (break_type_) {
     95     case BREAK_CHARACTER:
     96     case BREAK_WORD:
     97     case BREAK_LINE:
     98     case RULE_BASED:
     99       pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
    100       if (pos == UBRK_DONE) {
    101         pos_ = npos;
    102         return false;
    103       }
    104       pos_ = static_cast<size_t>(pos);
    105       return true;
    106     case BREAK_NEWLINE:
    107       do {
    108         pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
    109         if (pos == UBRK_DONE)
    110           break;
    111         pos_ = static_cast<size_t>(pos);
    112         status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
    113       } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
    114       if (pos == UBRK_DONE && prev_ == pos_) {
    115         pos_ = npos;
    116         return false;
    117       }
    118       return true;
    119     default:
    120       NOTREACHED() << "invalid break_type_";
    121       return false;
    122   }
    123 }
    124 
    125 bool BreakIterator::SetText(const base::char16* text, const size_t length) {
    126   UErrorCode status = U_ZERO_ERROR;
    127   ubrk_setText(static_cast<UBreakIterator*>(iter_),
    128                text, length, &status);
    129   pos_ = 0;  // implicit when ubrk_setText is done
    130   prev_ = npos;
    131   if (U_FAILURE(status)) {
    132     NOTREACHED() << "ubrk_setText failed";
    133     return false;
    134   }
    135   return true;
    136 }
    137 
    138 bool BreakIterator::IsWord() const {
    139   int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
    140   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
    141     return false;
    142   return status != UBRK_WORD_NONE;
    143 }
    144 
    145 bool BreakIterator::IsEndOfWord(size_t position) const {
    146   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
    147     return false;
    148 
    149   UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
    150   UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
    151   int32_t status = ubrk_getRuleStatus(iter);
    152   return (!!boundary && status != UBRK_WORD_NONE);
    153 }
    154 
    155 bool BreakIterator::IsStartOfWord(size_t position) const {
    156   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
    157     return false;
    158 
    159   UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
    160   UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
    161   ubrk_next(iter);
    162   int32_t next_status = ubrk_getRuleStatus(iter);
    163   return (!!boundary && next_status != UBRK_WORD_NONE);
    164 }
    165 
    166 bool BreakIterator::IsGraphemeBoundary(size_t position) const {
    167   if (break_type_ != BREAK_CHARACTER)
    168     return false;
    169 
    170   UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
    171   return !!ubrk_isBoundary(iter, static_cast<int32_t>(position));
    172 }
    173 
    174 string16 BreakIterator::GetString() const {
    175   DCHECK(prev_ != npos && pos_ != npos);
    176   return string_.substr(prev_, pos_ - prev_);
    177 }
    178 
    179 }  // namespace i18n
    180 }  // namespace base
    181