Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /**
     18  * A wrapper around ICU's line break iterator, that gives customized line
     19  * break opportunities, as well as identifying words for the purpose of
     20  * hyphenation.
     21  */
     22 
     23 #ifndef MINIKIN_WORD_BREAKER_H
     24 #define MINIKIN_WORD_BREAKER_H
     25 
     26 #include <list>
     27 #include <mutex>
     28 
     29 #include <unicode/brkiter.h>
     30 
     31 #include "minikin/Macros.h"
     32 #include "minikin/Range.h"
     33 
     34 #include "Locale.h"
     35 
     36 namespace minikin {
     37 
     38 // A class interface for providing pooling implementation of ICU's line breaker.
     39 // The implementation can be customized for testing purposes.
     40 class ICULineBreakerPool {
     41 public:
     42     struct Slot {
     43         Slot() : localeId(0), breaker(nullptr) {}
     44         Slot(uint64_t localeId, std::unique_ptr<icu::BreakIterator>&& breaker)
     45                 : localeId(localeId), breaker(std::move(breaker)) {}
     46 
     47         Slot(Slot&& other) = default;
     48         Slot& operator=(Slot&& other) = default;
     49 
     50         // Forbid copy and assignment.
     51         Slot(const Slot&) = delete;
     52         Slot& operator=(const Slot&) = delete;
     53 
     54         uint64_t localeId;
     55         std::unique_ptr<icu::BreakIterator> breaker;
     56     };
     57     virtual ~ICULineBreakerPool() {}
     58     virtual Slot acquire(const Locale& locale) = 0;
     59     virtual void release(Slot&& slot) = 0;
     60 };
     61 
     62 // An singleton implementation of the ICU line breaker pool.
     63 // Since creating ICU line breaker instance takes some time. Pool it for later use.
     64 class ICULineBreakerPoolImpl : public ICULineBreakerPool {
     65 public:
     66     Slot acquire(const Locale& locale) override;
     67     void release(Slot&& slot) override;
     68 
     69     static ICULineBreakerPoolImpl& getInstance() {
     70         static ICULineBreakerPoolImpl pool;
     71         return pool;
     72     }
     73 
     74 protected:
     75     // protected for testing purposes.
     76     static constexpr size_t MAX_POOL_SIZE = 4;
     77     ICULineBreakerPoolImpl(){};  // singleton.
     78     size_t getPoolSize() const {
     79         std::lock_guard<std::mutex> lock(mMutex);
     80         return mPool.size();
     81     }
     82 
     83 private:
     84     std::list<Slot> mPool GUARDED_BY(mMutex);
     85     mutable std::mutex mMutex;
     86 };
     87 
     88 class WordBreaker {
     89 public:
     90     virtual ~WordBreaker() { finish(); }
     91 
     92     WordBreaker();
     93 
     94     void setText(const uint16_t* data, size_t size);
     95 
     96     // Advance iterator to next word break with current locale. Return offset, or -1 if EOT
     97     ssize_t next();
     98 
     99     // Advance iterator to the break just after "from" with using the new provided locale.
    100     // Return offset, or -1 if EOT
    101     ssize_t followingWithLocale(const Locale& locale, size_t from);
    102 
    103     // Current offset of iterator, equal to 0 at BOT or last return from next()
    104     ssize_t current() const;
    105 
    106     // After calling next(), wordStart() and wordEnd() are offsets defining the previous
    107     // word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
    108     ssize_t wordStart() const;
    109 
    110     ssize_t wordEnd() const;
    111 
    112     // Returns the range from wordStart() to wordEnd().
    113     // If wordEnd() <= wordStart(), returns empty range.
    114     inline Range wordRange() const {
    115         const uint32_t start = wordStart();
    116         const uint32_t end = wordEnd();
    117         return start < end ? Range(start, end) : Range(end, end);
    118     }
    119 
    120     int breakBadness() const;
    121 
    122     void finish();
    123 
    124 protected:
    125     // protected virtual for testing purpose.
    126     // Caller must release the pool.
    127     WordBreaker(ICULineBreakerPool* pool);
    128 
    129 private:
    130     int32_t iteratorNext();
    131     void detectEmailOrUrl();
    132     ssize_t findNextBreakInEmailOrUrl();
    133 
    134     // Doesn't take ownership. Must not be nullptr. Must be set in constructor.
    135     ICULineBreakerPool* mPool;
    136 
    137     ICULineBreakerPool::Slot mIcuBreaker;
    138 
    139     UText mUText = UTEXT_INITIALIZER;
    140     const uint16_t* mText = nullptr;
    141     size_t mTextSize;
    142     ssize_t mLast;
    143     ssize_t mCurrent;
    144 
    145     // state for the email address / url detector
    146     ssize_t mScanOffset;
    147     bool mInEmailOrUrl;
    148 };
    149 
    150 }  // namespace minikin
    151 
    152 #endif  // MINIKIN_WORD_BREAKER_H
    153