Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "minikin/GraphemeBreak.h"
     18 
     19 #include <algorithm>
     20 #include <cstdint>
     21 
     22 #include <unicode/uchar.h>
     23 #include <unicode/utf16.h>
     24 
     25 #include "minikin/Emoji.h"
     26 
     27 namespace minikin {
     28 
     29 int32_t tailoredGraphemeClusterBreak(uint32_t c) {
     30     // Characters defined as Control that we want to treat them as Extend.
     31     // These are curated manually.
     32     if (c == 0x00AD                      // SHY
     33         || c == 0x061C                   // ALM
     34         || c == 0x180E                   // MONGOLIAN VOWEL SEPARATOR
     35         || c == 0x200B                   // ZWSP
     36         || c == 0x200E                   // LRM
     37         || c == 0x200F                   // RLM
     38         || (0x202A <= c && c <= 0x202E)  // LRE, RLE, PDF, LRO, RLO
     39         || ((c | 0xF) == 0x206F)         // WJ, invisible math operators, LRI, RLI, FSI, PDI,
     40                                          // and the deprecated invisible format controls
     41         || c == 0xFEFF                   // BOM
     42         || ((c | 0x7F) == 0xE007F))      // recently undeprecated tag characters in Plane 14
     43         return U_GCB_EXTEND;
     44     // THAI CHARACTER SARA AM is treated as a normal letter by most other implementations: they
     45     // allow a grapheme break before it.
     46     else if (c == 0x0E33)
     47         return U_GCB_OTHER;
     48     else
     49         return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
     50 }
     51 
     52 // Returns true for all characters whose IndicSyllabicCategory is Pure_Killer.
     53 // From http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory.txt
     54 bool isPureKiller(uint32_t c) {
     55     return (c == 0x0E3A || c == 0x0E4E || c == 0x0F84 || c == 0x103A || c == 0x1714 ||
     56             c == 0x1734 || c == 0x17D1 || c == 0x1BAA || c == 0x1BF2 || c == 0x1BF3 ||
     57             c == 0xA806 || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA ||
     58             c == 0x1172B);
     59 }
     60 
     61 bool GraphemeBreak::isGraphemeBreak(const float* advances, const uint16_t* buf, size_t start,
     62                                     size_t count, const size_t offset) {
     63     // This implementation closely follows Unicode Standard Annex #29 on
     64     // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
     65     // implementing a tailored version of extended grapheme clusters.
     66     // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.
     67 
     68     // Rule GB1, sot ; Rule GB2,  eot
     69     if (offset <= start || offset >= start + count) {
     70         return true;
     71     }
     72     if (U16_IS_TRAIL(buf[offset])) {
     73         // Don't break a surrogate pair, but a lonely trailing surrogate pair is a break
     74         return !U16_IS_LEAD(buf[offset - 1]);
     75     }
     76     uint32_t c1 = 0;
     77     uint32_t c2 = 0;
     78     size_t offset_back = offset;
     79     size_t offset_forward = offset;
     80     U16_PREV(buf, start, offset_back, c1);
     81     U16_NEXT(buf, offset_forward, start + count, c2);
     82     int32_t p1 = tailoredGraphemeClusterBreak(c1);
     83     int32_t p2 = tailoredGraphemeClusterBreak(c2);
     84     // Rule GB3, CR x LF
     85     if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
     86         return false;
     87     }
     88     // Rule GB4, (Control | CR | LF) 
     89     if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
     90         return true;
     91     }
     92     // Rule GB5,  (Control | CR | LF)
     93     if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
     94         return true;
     95     }
     96     // Rule GB6, L x ( L | V | LV | LVT )
     97     if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
     98         return false;
     99     }
    100     // Rule GB7, ( LV | V ) x ( V | T )
    101     if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
    102         return false;
    103     }
    104     // Rule GB8, ( LVT | T ) x T
    105     if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
    106         return false;
    107     }
    108     // Rule GB9, x (Extend | ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
    109     if (p2 == U_GCB_EXTEND || p2 == U_GCB_ZWJ || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) {
    110         return false;
    111     }
    112 
    113     // This is used to decide font-dependent grapheme clusters. If we don't have the advance
    114     // information, we become conservative in grapheme breaking and assume that it has no advance.
    115     const bool c2_has_advance = (advances != nullptr && advances[offset - start] != 0.0);
    116 
    117     // All the following rules are font-dependent, in the way that if we know c2 has an advance,
    118     // we definitely know that it cannot form a grapheme with the character(s) before it. So we
    119     // make the decision in favor a grapheme break early.
    120     if (c2_has_advance) {
    121         return true;
    122     }
    123 
    124     // Note: For Rule GB10 and GB11 below, we do not use the Unicode line breaking properties for
    125     // determining emoji-ness and carry our own data, because our data could be more fresh than what
    126     // ICU provides.
    127     //
    128     // Tailored version of Rule GB10, (E_Base | EBG) Extend*  E_Modifier.
    129     // The rule itself says do not break between emoji base and emoji modifiers, skipping all Extend
    130     // characters. Variation selectors are considered Extend, so they are handled fine.
    131     //
    132     // We tailor this by requiring that an actual ligature is formed. If the font doesn't form a
    133     // ligature, we allow a break before the modifier.
    134     if (isEmojiModifier(c2)) {
    135         uint32_t c0 = c1;
    136         size_t offset_backback = offset_back;
    137         int32_t p0 = p1;
    138         if (p0 == U_GCB_EXTEND && offset_backback > start) {
    139             // skip over emoji variation selector
    140             U16_PREV(buf, start, offset_backback, c0);
    141             p0 = tailoredGraphemeClusterBreak(c0);
    142         }
    143         if (isEmojiBase(c0)) {
    144             return false;
    145         }
    146     }
    147     // Tailored version of Rule GB11, ZWJ  (Glue_After_Zwj | EBG)
    148     // We try to make emoji sequences with ZWJ a single grapheme cluster, but only if they actually
    149     // merge to one cluster. So we are more relaxed than the UAX #29 rules in accepting any emoji
    150     // character after the ZWJ, but are tighter in that we only treat it as one cluster if a
    151     // ligature is actually formed and we also require the character before the ZWJ to also be an
    152     // emoji.
    153     if (p1 == U_GCB_ZWJ && isEmoji(c2) && offset_back > start) {
    154         // look at character before ZWJ to see that both can participate in an emoji zwj sequence
    155         uint32_t c0 = 0;
    156         size_t offset_backback = offset_back;
    157         U16_PREV(buf, start, offset_backback, c0);
    158         if (c0 == 0xFE0F && offset_backback > start) {
    159             // skip over emoji variation selector
    160             U16_PREV(buf, start, offset_backback, c0);
    161         }
    162         if (isEmoji(c0)) {
    163             return false;
    164         }
    165     }
    166     // Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases.
    167     // sot   (RI RI)*  RI x RI
    168     // [^RI] (RI RI)*  RI x RI
    169     //
    170     // If we have font information, we have already broken the cluster if and only if the second
    171     // character had no advance, which means a ligature was formed. If we don't, we look back like
    172     // UAX #29 recommends, but only up to 1000 code units.
    173     if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
    174         if (advances != nullptr) {
    175             // We have advances information. But if we are here, we already know c2 has no advance.
    176             // So we should definitely disallow a break.
    177             return false;
    178         } else {
    179             // Look at up to 1000 code units.
    180             const size_t lookback_barrier = std::max((ssize_t)start, (ssize_t)offset_back - 1000);
    181             size_t offset_backback = offset_back;
    182             while (offset_backback > lookback_barrier) {
    183                 uint32_t c0 = 0;
    184                 U16_PREV(buf, lookback_barrier, offset_backback, c0);
    185                 if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) {
    186                     offset_backback += U16_LENGTH(c0);
    187                     break;
    188                 }
    189             }
    190             // The number 4 comes from the number of code units in a whole flag.
    191             return (offset - offset_backback) % 4 == 0;
    192         }
    193     }
    194     // Cluster Indic syllables together (tailoring of UAX #29).
    195     // Immediately after each virama (that is not just a pure killer) followed by a letter, we
    196     // disallow grapheme breaks (if we are here, we don't know about advances, or we already know
    197     // that c2 has no advance).
    198     if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
    199         && !isPureKiller(c1) &&
    200         u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
    201         return false;
    202     }
    203     // Rule GB999, Any  Any
    204     return true;
    205 }
    206 
    207 size_t GraphemeBreak::getTextRunCursor(const float* advances, const uint16_t* buf, size_t start,
    208                                        size_t count, size_t offset, MoveOpt opt) {
    209     switch (opt) {
    210         case AFTER:
    211             if (offset < start + count) {
    212                 offset++;
    213             }
    214         // fall through
    215         case AT_OR_AFTER:
    216             while (!isGraphemeBreak(advances, buf, start, count, offset)) {
    217                 offset++;
    218             }
    219             break;
    220         case BEFORE:
    221             if (offset > start) {
    222                 offset--;
    223             }
    224         // fall through
    225         case AT_OR_BEFORE:
    226             while (!isGraphemeBreak(advances, buf, start, count, offset)) {
    227                 offset--;
    228             }
    229             break;
    230         case AT:
    231             if (!isGraphemeBreak(advances, buf, start, count, offset)) {
    232                 offset = (size_t)-1;
    233             }
    234             break;
    235     }
    236     return offset;
    237 }
    238 
    239 }  // namespace minikin
    240