Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /**
     18  * An implementation of Liang's hyphenation algorithm.
     19  */
     20 
#include <cstddef>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

#include "unicode/locid.h"
     24 
     25 #ifndef MINIKIN_HYPHENATOR_H
     26 #define MINIKIN_HYPHENATOR_H
     27 
     28 namespace minikin {
     29 
     30 enum class HyphenationType : uint8_t {
     31     // Note: There are implicit assumptions scattered in the code that DONT_BREAK is 0.
     32 
     33     // Do not break.
     34     DONT_BREAK = 0,
     35     // Break the line and insert a normal hyphen.
     36     BREAK_AND_INSERT_HYPHEN = 1,
     37     // Break the line and insert an Armenian hyphen (U+058A).
     38     BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,
     39     // Break the line and insert a maqaf (Hebrew hyphen, U+05BE).
     40     BREAK_AND_INSERT_MAQAF = 3,
     41     // Break the line and insert a Canadian Syllabics hyphen (U+1400).
     42     BREAK_AND_INSERT_UCAS_HYPHEN = 4,
     43     // Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen
     44     // present or the script does not use a hyphen (e.g. in Malayalam).
     45     BREAK_AND_DONT_INSERT_HYPHEN = 5,
     46     // Break and replace the last code unit with hyphen. Used for Catalan "ll" which hyphenates
     47     // as "l-/l".
     48     BREAK_AND_REPLACE_WITH_HYPHEN = 6,
     49     // Break the line, and repeat the hyphen (which is the last character) at the beginning of the
     50     // next line. Used in Polish, where "czerwono-niebieska" should hyphenate as
     51     // "czerwono-/-niebieska".
     52     BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,
     53     // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
     54     // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
     55     // behavior when a soft hyphen is used in Arabic script.
     56     BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
     57 };
     58 
     59 // The hyphen edit represents an edit to the string when a word is
     60 // hyphenated. The most common hyphen edit is adding a "-" at the end
     61 // of a syllable, but nonstandard hyphenation allows for more choices.
     62 // Note that a HyphenEdit can hold two types of edits at the same time,
     63 // One at the beginning of the string/line and one at the end.
     64 class HyphenEdit {
     65 public:
     66     static const uint32_t NO_EDIT = 0x00;
     67 
     68     static const uint32_t INSERT_HYPHEN_AT_END = 0x01;
     69     static const uint32_t INSERT_ARMENIAN_HYPHEN_AT_END = 0x02;
     70     static const uint32_t INSERT_MAQAF_AT_END = 0x03;
     71     static const uint32_t INSERT_UCAS_HYPHEN_AT_END = 0x04;
     72     static const uint32_t INSERT_ZWJ_AND_HYPHEN_AT_END = 0x05;
     73     static const uint32_t REPLACE_WITH_HYPHEN_AT_END = 0x06;
     74     static const uint32_t BREAK_AT_END = 0x07;
     75 
     76     static const uint32_t INSERT_HYPHEN_AT_START = 0x01 << 3;
     77     static const uint32_t INSERT_ZWJ_AT_START = 0x02 << 3;
     78     static const uint32_t BREAK_AT_START = 0x03 << 3;
     79 
     80     // Keep in sync with the definitions in the Java code at:
     81     // frameworks/base/graphics/java/android/graphics/Paint.java
     82     static const uint32_t MASK_END_OF_LINE = 0x07;
     83     static const uint32_t MASK_START_OF_LINE = 0x03 << 3;
     84 
     85     inline static bool isReplacement(uint32_t hyph) {
     86         return hyph == REPLACE_WITH_HYPHEN_AT_END;
     87     }
     88 
     89     inline static bool isInsertion(uint32_t hyph) {
     90         return (hyph == INSERT_HYPHEN_AT_END
     91                 || hyph == INSERT_ARMENIAN_HYPHEN_AT_END
     92                 || hyph == INSERT_MAQAF_AT_END
     93                 || hyph == INSERT_UCAS_HYPHEN_AT_END
     94                 || hyph == INSERT_ZWJ_AND_HYPHEN_AT_END
     95                 || hyph == INSERT_HYPHEN_AT_START
     96                 || hyph == INSERT_ZWJ_AT_START);
     97     }
     98 
     99     const static uint32_t* getHyphenString(uint32_t hyph);
    100     static uint32_t editForThisLine(HyphenationType type);
    101     static uint32_t editForNextLine(HyphenationType type);
    102 
    103     HyphenEdit() : hyphen(NO_EDIT) { }
    104     HyphenEdit(uint32_t hyphenInt) : hyphen(hyphenInt) { }  // NOLINT(implicit)
    105     uint32_t getHyphen() const { return hyphen; }
    106     bool operator==(const HyphenEdit &other) const { return hyphen == other.hyphen; }
    107 
    108     uint32_t getEnd() const { return hyphen & MASK_END_OF_LINE; }
    109     uint32_t getStart() const { return hyphen & MASK_START_OF_LINE; }
    110 
    111 private:
    112     uint32_t hyphen;
    113 };
    114 
    115 // hyb file header; implementation details are in the .cpp file
    116 struct Header;
    117 
    118 class Hyphenator {
    119 public:
    120     // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in
    121     // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the
    122     // corresponding code unit offset in the word.
    123     //
    124     // Example: word is "hyphen", result is the following, corresponding to "hy-phen":
    125     // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK]
    126     void hyphenate(std::vector<HyphenationType>* result, const uint16_t* word, size_t len,
    127             const icu::Locale& locale);
    128 
    129     // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character
    130     // immediately after which line breaks are allowed, but words containing it should not be
    131     // automatically hyphenated.
    132     static bool isLineBreakingHyphen(uint32_t cp);
    133 
    134     // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
    135     // the caller is responsible for ensuring that the lifetime of the pattern data is
    136     // at least as long as the Hyphenator object.
    137 
    138     // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens.
    139     static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix);
    140 
    141 private:
    142     // apply various hyphenation rules including hard and soft hyphens, ignoring patterns
    143     void hyphenateWithNoPatterns(HyphenationType* result, const uint16_t* word, size_t len,
    144             const icu::Locale& locale);
    145 
    146     // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map.
    147     // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or
    148     // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen.
    149     // Note that this method writes len+2 entries into alpha_codes (including start and stop)
    150     HyphenationType alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len);
    151 
    152     // calculate hyphenation from patterns, assuming alphabet lookup has already been done
    153     void hyphenateFromCodes(HyphenationType* result, const uint16_t* codes, size_t len,
    154             HyphenationType hyphenValue);
    155 
    156     // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
    157     // that temporary buffers can be stack-allocated without waste, which is a slightly
    158     // different use case. It measures UTF-16 code units.
    159     static const size_t MAX_HYPHENATED_SIZE = 64;
    160 
    161     const uint8_t* patternData;
    162     size_t minPrefix, minSuffix;
    163 
    164     // accessors for binary data
    165     const Header* getHeader() const {
    166         return reinterpret_cast<const Header*>(patternData);
    167     }
    168 
    169 };
    170 
    171 }  // namespace minikin
    172 
    173 #endif   // MINIKIN_HYPHENATOR_H
    174