Home | History | Annotate | Download | only in minikin
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <stdint.h>
     18 #include <algorithm>
     19 #include <unicode/uchar.h>
     20 #include <unicode/utf16.h>
     21 
     22 #include <minikin/GraphemeBreak.h>
     23 #include "MinikinInternal.h"
     24 
     25 namespace android {
     26 
     27 int32_t tailoredGraphemeClusterBreak(uint32_t c) {
     28     // Characters defined as Control that we want to treat them as Extend.
     29     // These are curated manually.
     30     if (c == 0x00AD                         // SHY
     31             || c == 0x061C                  // ALM
     32             || c == 0x180E                  // MONGOLIAN VOWEL SEPARATOR
     33             || c == 0x200B                  // ZWSP
     34             || c == 0x200E                  // LRM
     35             || c == 0x200F                  // RLM
     36             || (0x202A <= c && c <= 0x202E) // LRE, RLE, PDF, LRO, RLO
     37             || ((c | 0xF) == 0x206F)        // WJ, invisible math operators, LRI, RLI, FSI, PDI,
     38                                             // and the deprecated invisible format controls
     39             || c == 0xFEFF                  // BOM
     40             || ((c | 0x7F) == 0xE007F))     // recently undeprecated tag characters in Plane 14
     41         return U_GCB_EXTEND;
     42     // UTC-approved characters for the Prepend class, per
     43     // http://www.unicode.org/L2/L2015/15183r-graph-cluster-brk.txt
     44     // These should be removed when our copy of ICU gets updated to Unicode 9.0 (~2016 or 2017).
     45     else if ((0x0600 <= c && c <= 0x0605) // Arabic subtending marks
     46             || c == 0x06DD                // ARABIC SUBTENDING MARK
     47             || c == 0x070F                // SYRIAC ABBREVIATION MARK
     48             || c == 0x0D4E                // MALAYALAM LETTER DOT REPH
     49             || c == 0x110BD               // KAITHI NUMBER SIGN
     50             || c == 0x111C2               // SHARADA SIGN JIHVAMULIYA
     51             || c == 0x111C3)              // SHARADA SIGN UPADHMANIYA
     52         return U_GCB_PREPEND;
     53     // THAI CHARACTER SARA AM is treated as a normal letter by most other implementations: they
     54     // allow a grapheme break before it.
     55     else if (c == 0x0E33)
     56         return U_GCB_OTHER;
     57     else
     58         return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
     59 }
     60 
     61 // Returns true for all characters whose IndicSyllabicCategory is Pure_Killer.
     62 // From http://www.unicode.org/Public/8.0.0/ucd/IndicSyllabicCategory.txt
     63 bool isPureKiller(uint32_t c) {
     64     return (c == 0x0E3A || c == 0x0E4E || c == 0x0F84 || c == 0x103A || c == 0x1714 || c == 0x1734
     65             || c == 0x17D1 || c == 0x1BAA || c == 0x1BF2 || c == 0x1BF3 || c == 0xA806
     66             || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B);
     67 }
     68 
     69 bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
     70         size_t offset) {
     71     // This implementation closely follows Unicode Standard Annex #29 on
     72     // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
     73     // implementing a tailored version of extended grapheme clusters.
     74     // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.
     75 
     76     // Rule GB1, sot ; Rule GB2,  eot
     77     if (offset <= start || offset >= start + count) {
     78         return true;
     79     }
     80     if (U16_IS_TRAIL(buf[offset])) {
     81         // Don't break a surrogate pair, but a lonely trailing surrogate pair is a break
     82         return !U16_IS_LEAD(buf[offset - 1]);
     83     }
     84     uint32_t c1 = 0;
     85     uint32_t c2 = 0;
     86     size_t offset_back = offset;
     87     U16_PREV(buf, start, offset_back, c1);
     88     U16_NEXT(buf, offset, start + count, c2);
     89     int32_t p1 = tailoredGraphemeClusterBreak(c1);
     90     int32_t p2 = tailoredGraphemeClusterBreak(c2);
     91     // Rule GB3, CR x LF
     92     if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
     93         return false;
     94     }
     95     // Rule GB4, (Control | CR | LF) 
     96     if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
     97         return true;
     98     }
     99     // Rule GB5,  (Control | CR | LF)
    100     if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
    101         return true;
    102     }
    103     // Rule GB6, L x ( L | V | LV | LVT )
    104     if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
    105         return false;
    106     }
    107     // Rule GB7, ( LV | V ) x ( V | T )
    108     if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
    109         return false;
    110     }
    111     // Rule GB8, ( LVT | T ) x T
    112     if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
    113         return false;
    114     }
    115     // Rule GB8a that looks at even-off cases.
    116     //
    117     // sot   (RI RI)*  RI x RI
    118     // [^RI] (RI RI)*  RI x RI
    119     //                 RI  RI
    120     if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
    121         // Look at up to 1000 code units.
    122         start = std::max((ssize_t)start, (ssize_t)offset_back - 1000);
    123         while (offset_back > start) {
    124             U16_PREV(buf, start, offset_back, c1);
    125             if (tailoredGraphemeClusterBreak(c1) != U_GCB_REGIONAL_INDICATOR) {
    126                 offset_back += U16_LENGTH(c1);
    127                 break;
    128             }
    129         }
    130 
    131         // Note that the offset has moved forwared 2 code units by U16_NEXT.
    132         // The number 4 comes from the number of code units in a whole flag.
    133         return (offset - 2 - offset_back) % 4 == 0;
    134     }
    135     // Rule GB9, x Extend; Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
    136     if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) {
    137         return false;
    138     }
    139     // Cluster indic syllables together (tailoring of UAX #29)
    140     // Known limitation: this is overly conservative, and assumes that the virama may form a
    141     // conjunct with the following letter, which doesn't always happen.
    142     //
    143     // There is no easy solution to do this correctly. Even querying the font does not help (with
    144     // the current font technoloies), since the font may be creating the conjunct using multiple
    145     // glyphs, while the user may be perceiving that sequence of glyphs as one conjunct or one
    146     // letter.
    147     if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
    148             && !isPureKiller(c1)
    149             && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
    150         return false;
    151     }
    152     // Tailoring: make emoji sequences with ZWJ a single grapheme cluster
    153     if (c1 == 0x200D && isEmoji(c2) && offset_back > start) {
    154         // look at character before ZWJ to see that both can participate in an emoji zwj sequence
    155         uint32_t c0 = 0;
    156         U16_PREV(buf, start, offset_back, c0);
    157         if (c0 == 0xFE0F && offset_back > start) {
    158             // skip over emoji variation selector
    159             U16_PREV(buf, start, offset_back, c0);
    160         }
    161         if (isEmoji(c0)) {
    162             return false;
    163         }
    164     }
    165     // Proposed Rule GB9c from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
    166     // E_Base x E_Modifier
    167     if (isEmojiModifier(c2)) {
    168         if (c1 == 0xFE0F && offset_back > start) {
    169             // skip over emoji variation selector
    170             U16_PREV(buf, start, offset_back, c1);
    171         }
    172         if (isEmojiBase(c1)) {
    173             return false;
    174         }
    175     }
    176     // Rule GB10, Any  Any
    177     return true;
    178 }
    179 
    180 size_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count,
    181         size_t offset, MoveOpt opt) {
    182     switch (opt) {
    183     case AFTER:
    184         if (offset < start + count) {
    185             offset++;
    186         }
    187         // fall through
    188     case AT_OR_AFTER:
    189         while (!isGraphemeBreak(buf, start, count, offset)) {
    190             offset++;
    191         }
    192         break;
    193     case BEFORE:
    194         if (offset > start) {
    195             offset--;
    196         }
    197         // fall through
    198     case AT_OR_BEFORE:
    199         while (!isGraphemeBreak(buf, start, count, offset)) {
    200             offset--;
    201         }
    202         break;
    203     case AT:
    204         if (!isGraphemeBreak(buf, start, count, offset)) {
    205             offset = (size_t)-1;
    206         }
    207         break;
    208     }
    209     return offset;
    210 }
    211 
    212 }  // namespace android
    213