1 /* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <stdint.h> 18 #include <algorithm> 19 #include <unicode/uchar.h> 20 #include <unicode/utf16.h> 21 22 #include <minikin/GraphemeBreak.h> 23 #include "MinikinInternal.h" 24 25 namespace android { 26 27 int32_t tailoredGraphemeClusterBreak(uint32_t c) { 28 // Characters defined as Control that we want to treat them as Extend. 29 // These are curated manually. 30 if (c == 0x00AD // SHY 31 || c == 0x061C // ALM 32 || c == 0x180E // MONGOLIAN VOWEL SEPARATOR 33 || c == 0x200B // ZWSP 34 || c == 0x200E // LRM 35 || c == 0x200F // RLM 36 || (0x202A <= c && c <= 0x202E) // LRE, RLE, PDF, LRO, RLO 37 || ((c | 0xF) == 0x206F) // WJ, invisible math operators, LRI, RLI, FSI, PDI, 38 // and the deprecated invisible format controls 39 || c == 0xFEFF // BOM 40 || ((c | 0x7F) == 0xE007F)) // recently undeprecated tag characters in Plane 14 41 return U_GCB_EXTEND; 42 // UTC-approved characters for the Prepend class, per 43 // http://www.unicode.org/L2/L2015/15183r-graph-cluster-brk.txt 44 // These should be removed when our copy of ICU gets updated to Unicode 9.0 (~2016 or 2017). 45 else if ((0x0600 <= c && c <= 0x0605) // Arabic subtending marks 46 || c == 0x06DD // ARABIC SUBTENDING MARK 47 || c == 0x070F // SYRIAC ABBREVIATION MARK 48 || c == 0x0D4E // MALAYALAM LETTER DOT REPH 49 || c == 0x110BD // KAITHI NUMBER SIGN 50 || c == 0x111C2 // SHARADA SIGN JIHVAMULIYA 51 || c == 0x111C3) // SHARADA SIGN UPADHMANIYA 52 return U_GCB_PREPEND; 53 // THAI CHARACTER SARA AM is treated as a normal letter by most other implementations: they 54 // allow a grapheme break before it. 55 else if (c == 0x0E33) 56 return U_GCB_OTHER; 57 else 58 return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 59 } 60 61 // Returns true for all characters whose IndicSyllabicCategory is Pure_Killer. 62 // From http://www.unicode.org/Public/8.0.0/ucd/IndicSyllabicCategory.txt 63 bool isPureKiller(uint32_t c) { 64 return (c == 0x0E3A || c == 0x0E4E || c == 0x0F84 || c == 0x103A || c == 0x1714 || c == 0x1734 65 || c == 0x17D1 || c == 0x1BAA || c == 0x1BF2 || c == 0x1BF3 || c == 0xA806 66 || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B); 67 } 68 69 bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count, 70 size_t offset) { 71 // This implementation closely follows Unicode Standard Annex #29 on 72 // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/), 73 // implementing a tailored version of extended grapheme clusters. 74 // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules. 75 76 // Rule GB1, sot ; Rule GB2, eot 77 if (offset <= start || offset >= start + count) { 78 return true; 79 } 80 if (U16_IS_TRAIL(buf[offset])) { 81 // Don't break a surrogate pair, but a lonely trailing surrogate pair is a break 82 return !U16_IS_LEAD(buf[offset - 1]); 83 } 84 uint32_t c1 = 0; 85 uint32_t c2 = 0; 86 size_t offset_back = offset; 87 U16_PREV(buf, start, offset_back, c1); 88 U16_NEXT(buf, offset, start + count, c2); 89 int32_t p1 = tailoredGraphemeClusterBreak(c1); 90 int32_t p2 = tailoredGraphemeClusterBreak(c2); 91 // Rule GB3, CR x LF 92 if (p1 == U_GCB_CR && p2 == U_GCB_LF) { 93 return false; 94 } 95 // Rule GB4, (Control | CR | LF) 96 if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) { 97 return true; 98 } 99 // Rule GB5, (Control | CR | LF) 100 if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) { 101 return true; 102 } 103 // Rule GB6, L x ( L | V | LV | LVT ) 104 if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) { 105 return false; 106 } 107 // Rule GB7, ( LV | V ) x ( V | T ) 108 if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) { 109 return false; 110 } 111 // Rule GB8, ( LVT | T ) x T 112 if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) { 113 return false; 114 } 115 // Rule GB8a that looks at even-off cases. 116 // 117 // sot (RI RI)* RI x RI 118 // [^RI] (RI RI)* RI x RI 119 // RI RI 120 if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) { 121 // Look at up to 1000 code units. 122 start = std::max((ssize_t)start, (ssize_t)offset_back - 1000); 123 while (offset_back > start) { 124 U16_PREV(buf, start, offset_back, c1); 125 if (tailoredGraphemeClusterBreak(c1) != U_GCB_REGIONAL_INDICATOR) { 126 offset_back += U16_LENGTH(c1); 127 break; 128 } 129 } 130 131 // Note that the offset has moved forwared 2 code units by U16_NEXT. 132 // The number 4 comes from the number of code units in a whole flag. 133 return (offset - 2 - offset_back) % 4 == 0; 134 } 135 // Rule GB9, x Extend; Rule GB9a, x SpacingMark; Rule GB9b, Prepend x 136 if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) { 137 return false; 138 } 139 // Cluster indic syllables together (tailoring of UAX #29) 140 // Known limitation: this is overly conservative, and assumes that the virama may form a 141 // conjunct with the following letter, which doesn't always happen. 142 // 143 // There is no easy solution to do this correctly. Even querying the font does not help (with 144 // the current font technoloies), since the font may be creating the conjunct using multiple 145 // glyphs, while the user may be perceiving that sequence of glyphs as one conjunct or one 146 // letter. 147 if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama 148 && !isPureKiller(c1) 149 && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) { 150 return false; 151 } 152 // Tailoring: make emoji sequences with ZWJ a single grapheme cluster 153 if (c1 == 0x200D && isEmoji(c2) && offset_back > start) { 154 // look at character before ZWJ to see that both can participate in an emoji zwj sequence 155 uint32_t c0 = 0; 156 U16_PREV(buf, start, offset_back, c0); 157 if (c0 == 0xFE0F && offset_back > start) { 158 // skip over emoji variation selector 159 U16_PREV(buf, start, offset_back, c0); 160 } 161 if (isEmoji(c0)) { 162 return false; 163 } 164 } 165 // Proposed Rule GB9c from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf 166 // E_Base x E_Modifier 167 if (isEmojiModifier(c2)) { 168 if (c1 == 0xFE0F && offset_back > start) { 169 // skip over emoji variation selector 170 U16_PREV(buf, start, offset_back, c1); 171 } 172 if (isEmojiBase(c1)) { 173 return false; 174 } 175 } 176 // Rule GB10, Any Any 177 return true; 178 } 179 180 size_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count, 181 size_t offset, MoveOpt opt) { 182 switch (opt) { 183 case AFTER: 184 if (offset < start + count) { 185 offset++; 186 } 187 // fall through 188 case AT_OR_AFTER: 189 while (!isGraphemeBreak(buf, start, count, offset)) { 190 offset++; 191 } 192 break; 193 case BEFORE: 194 if (offset > start) { 195 offset--; 196 } 197 // fall through 198 case AT_OR_BEFORE: 199 while (!isGraphemeBreak(buf, start, count, offset)) { 200 offset--; 201 } 202 break; 203 case AT: 204 if (!isGraphemeBreak(buf, start, count, offset)) { 205 offset = (size_t)-1; 206 } 207 break; 208 } 209 return offset; 210 } 211 212 } // namespace android 213