Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "WordBreaker.h"
     18 
     19 #include <list>
     20 #include <map>
     21 
     22 #include <unicode/uchar.h>
     23 #include <unicode/utf16.h>
     24 
     25 #include "minikin/Emoji.h"
     26 #include "minikin/Hyphenator.h"
     27 
     28 #include "Locale.h"
     29 #include "MinikinInternal.h"
     30 
     31 namespace minikin {
     32 
     33 namespace {
     34 static icu::BreakIterator* createNewIterator(const Locale& locale) {
     35     // TODO: handle failure status
     36     UErrorCode status = U_ZERO_ERROR;
     37     return icu::BreakIterator::createLineInstance(
     38             locale.isUnsupported() ? icu::Locale::getRoot()
     39                                    : icu::Locale::createFromName(locale.getString().c_str()),
     40             status);
     41 }
     42 }  // namespace
     43 
     44 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale) {
     45     const uint64_t id = locale.getIdentifier();
     46     std::lock_guard<std::mutex> lock(mMutex);
     47     for (auto i = mPool.begin(); i != mPool.end(); i++) {
     48         if (i->localeId == id) {
     49             Slot slot = std::move(*i);
     50             mPool.erase(i);
     51             return slot;
     52         }
     53     }
     54 
     55     // Not found in pool. Create new one.
     56     return {id, std::unique_ptr<icu::BreakIterator>(createNewIterator(locale))};
     57 }
     58 
     59 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
     60     if (slot.breaker.get() == nullptr) {
     61         return;  // Already released slot. Do nothing.
     62     }
     63     std::lock_guard<std::mutex> lock(mMutex);
     64     if (mPool.size() >= MAX_POOL_SIZE) {
     65         // Pool is full. Move to local variable, so that the given slot will be released when the
     66         // variable leaves the scope.
     67         Slot localSlot = std::move(slot);
     68         return;
     69     }
     70     mPool.push_front(std::move(slot));
     71 }
     72 
     73 WordBreaker::WordBreaker() : mPool(&ICULineBreakerPoolImpl::getInstance()) {}
     74 
     75 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool) {}
     76 
     77 ssize_t WordBreaker::followingWithLocale(const Locale& locale, size_t from) {
     78     mIcuBreaker = mPool->acquire(locale);
     79     UErrorCode status = U_ZERO_ERROR;
     80     MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
     81     // TODO: handle failure status
     82     mIcuBreaker.breaker->setText(&mUText, status);
     83     if (mInEmailOrUrl) {
     84         // Note:
     85         // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context.
     86         // The email/URL detection doesn't support following() functionality, so that we can't
     87         // restart from the specific position. This means following() can not be supported in
     88         // general, but keeping old email/URL context works for LineBreaker since it just wants to
     89         // re-calculate the next break point with the new locale.
     90     } else {
     91         mCurrent = mLast = mScanOffset = from;
     92         next();
     93     }
     94     return mCurrent;
     95 }
     96 
     97 void WordBreaker::setText(const uint16_t* data, size_t size) {
     98     mText = data;
     99     mTextSize = size;
    100     mLast = 0;
    101     mCurrent = 0;
    102     mScanOffset = 0;
    103     mInEmailOrUrl = false;
    104     UErrorCode status = U_ZERO_ERROR;
    105     utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size, &status);
    106 }
    107 
    108 ssize_t WordBreaker::current() const {
    109     return mCurrent;
    110 }
    111 
    112 /**
    113  * Determine whether a line break at position i within the buffer buf is valid. This
    114  * represents customization beyond the ICU behavior, because plain ICU provides some
    115  * line break opportunities that we don't want.
    116  **/
    117 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
    118     const size_t position = static_cast<size_t>(i);
    119     if (i == icu::BreakIterator::DONE || position == bufEnd) {
    120         // If the iterator reaches the end, treat as break.
    121         return true;
    122     }
    123     uint32_t codePoint;
    124     size_t prev_offset = position;
    125     U16_PREV(buf, 0, prev_offset, codePoint);
    126     // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
    127     if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
    128         return false;
    129     }
    130     // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
    131     // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
    132     // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
    133     // where no line break could be imagined, since the Myanmar virama is a pure stacker.
    134     if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
    135         return false;
    136     }
    137 
    138     uint32_t next_codepoint;
    139     size_t next_offset = position;
    140     U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
    141 
    142     // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
    143     // emoji data than ICU does.
    144     if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
    145         return false;
    146     }
    147 
    148     // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
    149     if (isEmojiModifier(next_codepoint)) {
    150         if (codePoint == 0xFE0F && prev_offset > 0) {
    151             // skip over emoji variation selector
    152             U16_PREV(buf, 0, prev_offset, codePoint);
    153         }
    154         if (isEmojiBase(codePoint)) {
    155             return false;
    156         }
    157     }
    158     return true;
    159 }
    160 
    161 // Customized iteratorNext that takes care of both resets and our modifications
    162 // to ICU's behavior.
    163 int32_t WordBreaker::iteratorNext() {
    164     int32_t result = mIcuBreaker.breaker->following(mCurrent);
    165     while (!isValidBreak(mText, mTextSize, result)) {
    166         result = mIcuBreaker.breaker->next();
    167     }
    168     return result;
    169 }
    170 
    171 // Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
    172 static bool breakAfter(uint16_t c) {
    173     return c == ':' || c == '=' || c == '&';
    174 }
    175 
    176 // Chicago Manual of Style recommends breaking before these characters in URLs and email addresses
    177 static bool breakBefore(uint16_t c) {
    178     return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' || c == '#' ||
    179            c == '%' || c == '=' || c == '&';
    180 }
    181 
    182 enum ScanState {
    183     START,
    184     SAW_AT,
    185     SAW_COLON,
    186     SAW_COLON_SLASH,
    187     SAW_COLON_SLASH_SLASH,
    188 };
    189 
    190 void WordBreaker::detectEmailOrUrl() {
    191     // scan forward from current ICU position for email address or URL
    192     if (mLast >= mScanOffset) {
    193         ScanState state = START;
    194         size_t i;
    195         for (i = mLast; i < mTextSize; i++) {
    196             uint16_t c = mText[i];
    197             // scan only ASCII characters, stop at space
    198             if (!(' ' < c && c <= 0x007E)) {
    199                 break;
    200             }
    201             if (state == START && c == '@') {
    202                 state = SAW_AT;
    203             } else if (state == START && c == ':') {
    204                 state = SAW_COLON;
    205             } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
    206                 if (c == '/') {
    207                     state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
    208                 } else {
    209                     state = START;
    210                 }
    211             }
    212         }
    213         if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
    214             if (!mIcuBreaker.breaker->isBoundary(i)) {
    215                 // If there are combining marks or such at the end of the URL or the email address,
    216                 // consider them a part of the URL or the email, and skip to the next actual
    217                 // boundary.
    218                 i = mIcuBreaker.breaker->following(i);
    219             }
    220             mInEmailOrUrl = true;
    221         } else {
    222             mInEmailOrUrl = false;
    223         }
    224         mScanOffset = i;
    225     }
    226 }
    227 
    228 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
    229     // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
    230     uint16_t lastChar = mText[mLast];
    231     ssize_t i;
    232     for (i = mLast + 1; i < mScanOffset; i++) {
    233         if (breakAfter(lastChar)) {
    234             break;
    235         }
    236         // break after double slash
    237         if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
    238             break;
    239         }
    240         const uint16_t thisChar = mText[i];
    241         // never break after hyphen
    242         if (lastChar != '-') {
    243             if (breakBefore(thisChar)) {
    244                 break;
    245             }
    246             // break before single slash
    247             if (thisChar == '/' && lastChar != '/' &&
    248                 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
    249                 break;
    250             }
    251         }
    252         lastChar = thisChar;
    253     }
    254     return i;
    255 }
    256 
    257 ssize_t WordBreaker::next() {
    258     mLast = mCurrent;
    259 
    260     detectEmailOrUrl();
    261     if (mInEmailOrUrl) {
    262         mCurrent = findNextBreakInEmailOrUrl();
    263     } else {  // Business as usual
    264         mCurrent = (ssize_t)iteratorNext();
    265     }
    266     return mCurrent;
    267 }
    268 
    269 ssize_t WordBreaker::wordStart() const {
    270     if (mInEmailOrUrl) {
    271         return mLast;
    272     }
    273     ssize_t result = mLast;
    274     while (result < mCurrent) {
    275         UChar32 c;
    276         ssize_t ix = result;
    277         U16_NEXT(mText, ix, mCurrent, c);
    278         const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
    279         // strip leading punctuation, defined as OP and QU line breaking classes,
    280         // see UAX #14
    281         if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
    282             break;
    283         }
    284         result = ix;
    285     }
    286     return result;
    287 }
    288 
    289 ssize_t WordBreaker::wordEnd() const {
    290     if (mInEmailOrUrl) {
    291         return mLast;
    292     }
    293     ssize_t result = mCurrent;
    294     while (result > mLast) {
    295         UChar32 c;
    296         ssize_t ix = result;
    297         U16_PREV(mText, mLast, ix, c);
    298         const int32_t gc_mask = U_GET_GC_MASK(c);
    299         // strip trailing spaces, punctuation and control characters
    300         if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
    301             break;
    302         }
    303         result = ix;
    304     }
    305     return result;
    306 }
    307 
    308 int WordBreaker::breakBadness() const {
    309     return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
    310 }
    311 
    312 void WordBreaker::finish() {
    313     mText = nullptr;
    314     // Note: calling utext_close multiply is safe
    315     utext_close(&mUText);
    316     mPool->release(std::move(mIcuBreaker));
    317 }
    318 
    319 }  // namespace minikin
    320