1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "WordBreaker.h" 18 19 #include <list> 20 #include <map> 21 22 #include <unicode/uchar.h> 23 #include <unicode/utf16.h> 24 25 #include "minikin/Emoji.h" 26 #include "minikin/Hyphenator.h" 27 28 #include "Locale.h" 29 #include "MinikinInternal.h" 30 31 namespace minikin { 32 33 namespace { 34 static icu::BreakIterator* createNewIterator(const Locale& locale) { 35 // TODO: handle failure status 36 UErrorCode status = U_ZERO_ERROR; 37 return icu::BreakIterator::createLineInstance( 38 locale.isUnsupported() ? icu::Locale::getRoot() 39 : icu::Locale::createFromName(locale.getString().c_str()), 40 status); 41 } 42 } // namespace 43 44 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale) { 45 const uint64_t id = locale.getIdentifier(); 46 std::lock_guard<std::mutex> lock(mMutex); 47 for (auto i = mPool.begin(); i != mPool.end(); i++) { 48 if (i->localeId == id) { 49 Slot slot = std::move(*i); 50 mPool.erase(i); 51 return slot; 52 } 53 } 54 55 // Not found in pool. Create new one. 56 return {id, std::unique_ptr<icu::BreakIterator>(createNewIterator(locale))}; 57 } 58 59 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) { 60 if (slot.breaker.get() == nullptr) { 61 return; // Already released slot. Do nothing. 62 } 63 std::lock_guard<std::mutex> lock(mMutex); 64 if (mPool.size() >= MAX_POOL_SIZE) { 65 // Pool is full. Move to local variable, so that the given slot will be released when the 66 // variable leaves the scope. 67 Slot localSlot = std::move(slot); 68 return; 69 } 70 mPool.push_front(std::move(slot)); 71 } 72 73 WordBreaker::WordBreaker() : mPool(&ICULineBreakerPoolImpl::getInstance()) {} 74 75 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool) {} 76 77 ssize_t WordBreaker::followingWithLocale(const Locale& locale, size_t from) { 78 mIcuBreaker = mPool->acquire(locale); 79 UErrorCode status = U_ZERO_ERROR; 80 MINIKIN_ASSERT(mText != nullptr, "setText must be called first"); 81 // TODO: handle failure status 82 mIcuBreaker.breaker->setText(&mUText, status); 83 if (mInEmailOrUrl) { 84 // Note: 85 // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context. 86 // The email/URL detection doesn't support following() functionality, so that we can't 87 // restart from the specific position. This means following() can not be supported in 88 // general, but keeping old email/URL context works for LineBreaker since it just wants to 89 // re-calculate the next break point with the new locale. 90 } else { 91 mCurrent = mLast = mScanOffset = from; 92 next(); 93 } 94 return mCurrent; 95 } 96 97 void WordBreaker::setText(const uint16_t* data, size_t size) { 98 mText = data; 99 mTextSize = size; 100 mLast = 0; 101 mCurrent = 0; 102 mScanOffset = 0; 103 mInEmailOrUrl = false; 104 UErrorCode status = U_ZERO_ERROR; 105 utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size, &status); 106 } 107 108 ssize_t WordBreaker::current() const { 109 return mCurrent; 110 } 111 112 /** 113 * Determine whether a line break at position i within the buffer buf is valid. This 114 * represents customization beyond the ICU behavior, because plain ICU provides some 115 * line break opportunities that we don't want. 116 **/ 117 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) { 118 const size_t position = static_cast<size_t>(i); 119 if (i == icu::BreakIterator::DONE || position == bufEnd) { 120 // If the iterator reaches the end, treat as break. 121 return true; 122 } 123 uint32_t codePoint; 124 size_t prev_offset = position; 125 U16_PREV(buf, 0, prev_offset, codePoint); 126 // Do not break on hard or soft hyphens. These are handled by automatic hyphenation. 127 if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) { 128 return false; 129 } 130 // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go 131 // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid 132 // too much looking around in the strings, we simply avoid breaking after any Myanmar virama, 133 // where no line break could be imagined, since the Myanmar virama is a pure stacker. 134 if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA 135 return false; 136 } 137 138 uint32_t next_codepoint; 139 size_t next_offset = position; 140 U16_NEXT(buf, next_offset, bufEnd, next_codepoint); 141 142 // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher 143 // emoji data than ICU does. 144 if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) { 145 return false; 146 } 147 148 // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does. 149 if (isEmojiModifier(next_codepoint)) { 150 if (codePoint == 0xFE0F && prev_offset > 0) { 151 // skip over emoji variation selector 152 U16_PREV(buf, 0, prev_offset, codePoint); 153 } 154 if (isEmojiBase(codePoint)) { 155 return false; 156 } 157 } 158 return true; 159 } 160 161 // Customized iteratorNext that takes care of both resets and our modifications 162 // to ICU's behavior. 163 int32_t WordBreaker::iteratorNext() { 164 int32_t result = mIcuBreaker.breaker->following(mCurrent); 165 while (!isValidBreak(mText, mTextSize, result)) { 166 result = mIcuBreaker.breaker->next(); 167 } 168 return result; 169 } 170 171 // Chicago Manual of Style recommends breaking after these characters in URLs and email addresses 172 static bool breakAfter(uint16_t c) { 173 return c == ':' || c == '=' || c == '&'; 174 } 175 176 // Chicago Manual of Style recommends breaking before these characters in URLs and email addresses 177 static bool breakBefore(uint16_t c) { 178 return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' || c == '#' || 179 c == '%' || c == '=' || c == '&'; 180 } 181 182 enum ScanState { 183 START, 184 SAW_AT, 185 SAW_COLON, 186 SAW_COLON_SLASH, 187 SAW_COLON_SLASH_SLASH, 188 }; 189 190 void WordBreaker::detectEmailOrUrl() { 191 // scan forward from current ICU position for email address or URL 192 if (mLast >= mScanOffset) { 193 ScanState state = START; 194 size_t i; 195 for (i = mLast; i < mTextSize; i++) { 196 uint16_t c = mText[i]; 197 // scan only ASCII characters, stop at space 198 if (!(' ' < c && c <= 0x007E)) { 199 break; 200 } 201 if (state == START && c == '@') { 202 state = SAW_AT; 203 } else if (state == START && c == ':') { 204 state = SAW_COLON; 205 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) { 206 if (c == '/') { 207 state = static_cast<ScanState>((int)state + 1); // next state adds a slash 208 } else { 209 state = START; 210 } 211 } 212 } 213 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) { 214 if (!mIcuBreaker.breaker->isBoundary(i)) { 215 // If there are combining marks or such at the end of the URL or the email address, 216 // consider them a part of the URL or the email, and skip to the next actual 217 // boundary. 218 i = mIcuBreaker.breaker->following(i); 219 } 220 mInEmailOrUrl = true; 221 } else { 222 mInEmailOrUrl = false; 223 } 224 mScanOffset = i; 225 } 226 } 227 228 ssize_t WordBreaker::findNextBreakInEmailOrUrl() { 229 // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.) 230 uint16_t lastChar = mText[mLast]; 231 ssize_t i; 232 for (i = mLast + 1; i < mScanOffset; i++) { 233 if (breakAfter(lastChar)) { 234 break; 235 } 236 // break after double slash 237 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') { 238 break; 239 } 240 const uint16_t thisChar = mText[i]; 241 // never break after hyphen 242 if (lastChar != '-') { 243 if (breakBefore(thisChar)) { 244 break; 245 } 246 // break before single slash 247 if (thisChar == '/' && lastChar != '/' && 248 !(i + 1 < mScanOffset && mText[i + 1] == '/')) { 249 break; 250 } 251 } 252 lastChar = thisChar; 253 } 254 return i; 255 } 256 257 ssize_t WordBreaker::next() { 258 mLast = mCurrent; 259 260 detectEmailOrUrl(); 261 if (mInEmailOrUrl) { 262 mCurrent = findNextBreakInEmailOrUrl(); 263 } else { // Business as usual 264 mCurrent = (ssize_t)iteratorNext(); 265 } 266 return mCurrent; 267 } 268 269 ssize_t WordBreaker::wordStart() const { 270 if (mInEmailOrUrl) { 271 return mLast; 272 } 273 ssize_t result = mLast; 274 while (result < mCurrent) { 275 UChar32 c; 276 ssize_t ix = result; 277 U16_NEXT(mText, ix, mCurrent, c); 278 const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK); 279 // strip leading punctuation, defined as OP and QU line breaking classes, 280 // see UAX #14 281 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) { 282 break; 283 } 284 result = ix; 285 } 286 return result; 287 } 288 289 ssize_t WordBreaker::wordEnd() const { 290 if (mInEmailOrUrl) { 291 return mLast; 292 } 293 ssize_t result = mCurrent; 294 while (result > mLast) { 295 UChar32 c; 296 ssize_t ix = result; 297 U16_PREV(mText, mLast, ix, c); 298 const int32_t gc_mask = U_GET_GC_MASK(c); 299 // strip trailing spaces, punctuation and control characters 300 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) { 301 break; 302 } 303 result = ix; 304 } 305 return result; 306 } 307 308 int WordBreaker::breakBadness() const { 309 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0; 310 } 311 312 void WordBreaker::finish() { 313 mText = nullptr; 314 // Note: calling utext_close multiply is safe 315 utext_close(&mUText); 316 mPool->release(std::move(mIcuBreaker)); 317 } 318 319 } // namespace minikin 320