1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <gtest/gtest.h> 18 #include <UnicodeUtils.h> 19 #include <minikin/GraphemeBreak.h> 20 21 using namespace android; 22 23 bool IsBreak(const char* src) { 24 const size_t BUF_SIZE = 256; 25 uint16_t buf[BUF_SIZE]; 26 size_t offset; 27 size_t size; 28 ParseUnicode(buf, BUF_SIZE, src, &size, &offset); 29 return GraphemeBreak::isGraphemeBreak(buf, 0, size, offset); 30 } 31 32 TEST(GraphemeBreak, utf16) { 33 EXPECT_FALSE(IsBreak("U+D83C | U+DC31")); // emoji, U+1F431 34 35 // tests for invalid UTF-16 36 EXPECT_TRUE(IsBreak("U+D800 | U+D800")); // two leading surrogates 37 EXPECT_TRUE(IsBreak("U+DC00 | U+DC00")); // two trailing surrogates 38 EXPECT_TRUE(IsBreak("'a' | U+D800")); // lonely leading surrogate 39 EXPECT_TRUE(IsBreak("U+DC00 | 'a'")); // lonely trailing surrogate 40 EXPECT_TRUE(IsBreak("U+D800 | 'a'")); // leading surrogate followed by non-surrogate 41 EXPECT_TRUE(IsBreak("'a' | U+DC00")); // non-surrogate followed by trailing surrogate 42 } 43 44 TEST(GraphemeBreak, rules) { 45 // Rule GB1, sot ; Rule GB2, eot 46 EXPECT_TRUE(IsBreak("| 'a'")); 47 EXPECT_TRUE(IsBreak("'a' |")); 48 49 // Rule GB3, CR x LF 50 EXPECT_FALSE(IsBreak("U+000D | U+000A")); // CR x LF 51 52 // Rule GB4, (Control | CR | LF) 53 EXPECT_TRUE(IsBreak("'a' | U+2028")); // Line separator 54 EXPECT_TRUE(IsBreak("'a' | U+000D")); // LF 55 EXPECT_TRUE(IsBreak("'a' | U+000A")); // CR 56 57 // Rule GB5, (Control | CR | LF) 58 EXPECT_TRUE(IsBreak("U+2028 | 'a'")); // Line separator 59 EXPECT_TRUE(IsBreak("U+000D | 'a'")); // LF 60 EXPECT_TRUE(IsBreak("U+000A | 'a'")); // CR 61 62 // Rule GB6, L x ( L | V | LV | LVT ) 63 EXPECT_FALSE(IsBreak("U+1100 | U+1100")); // L x L 64 EXPECT_FALSE(IsBreak("U+1100 | U+1161")); // L x V 65 EXPECT_FALSE(IsBreak("U+1100 | U+AC00")); // L x LV 66 EXPECT_FALSE(IsBreak("U+1100 | U+AC01")); // L x LVT 67 68 // Rule GB7, ( LV | V ) x ( V | T ) 69 EXPECT_FALSE(IsBreak("U+AC00 | U+1161")); // LV x V 70 EXPECT_FALSE(IsBreak("U+1161 | U+1161")); // V x V 71 EXPECT_FALSE(IsBreak("U+AC00 | U+11A8")); // LV x T 72 EXPECT_FALSE(IsBreak("U+1161 | U+11A8")); // V x T 73 74 // Rule GB8, ( LVT | T ) x T 75 EXPECT_FALSE(IsBreak("U+AC01 | U+11A8")); // LVT x T 76 EXPECT_FALSE(IsBreak("U+11A8 | U+11A8")); // T x T 77 78 // Other hangul pairs not counted above _are_ breaks (GB10) 79 EXPECT_TRUE(IsBreak("U+AC00 | U+1100")); // LV x L 80 EXPECT_TRUE(IsBreak("U+AC01 | U+1100")); // LVT x L 81 EXPECT_TRUE(IsBreak("U+11A8 | U+1100")); // T x L 82 EXPECT_TRUE(IsBreak("U+11A8 | U+AC00")); // T x LV 83 EXPECT_TRUE(IsBreak("U+11A8 | U+AC01")); // T x LVT 84 85 // Rule GB8a, Regional_Indicator x Regional_Indicator 86 EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8")); 87 EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 88 EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 89 EXPECT_FALSE(IsBreak("U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag) 90 91 EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag) 92 EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag) 93 94 EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag) 95 EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag) 96 97 EXPECT_TRUE( 98 IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 99 EXPECT_FALSE( 100 IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 101 EXPECT_FALSE( 102 IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag) 103 104 // Rule GB9, x Extend 105 EXPECT_FALSE(IsBreak("'a' | U+0301")); // combining accent 106 // Rule GB9a, x SpacingMark 107 EXPECT_FALSE(IsBreak("U+0915 | U+093E")); // KA, AA (spacing mark) 108 // Rule GB9b, Prepend x 109 // see tailoring test for prepend, as current ICU doesn't have any characters in the class 110 111 // Rule GB10, Any Any 112 EXPECT_TRUE(IsBreak("'a' | 'b'")); 113 EXPECT_TRUE(IsBreak("'f' | 'i'")); // probable ligature 114 EXPECT_TRUE(IsBreak("U+0644 | U+0627")); // probable ligature, lam + alef 115 EXPECT_TRUE(IsBreak("U+4E00 | U+4E00")); // CJK ideographs 116 EXPECT_TRUE(IsBreak("'a' | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) 117 EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | 'a'")); // Regional indicator pair (flag) 118 } 119 120 TEST(GraphemeBreak, tailoring) { 121 // control characters that we interpret as "extend" 122 EXPECT_FALSE(IsBreak("'a' | U+00AD")); // soft hyphen 123 EXPECT_FALSE(IsBreak("'a' | U+200B")); // zwsp 124 EXPECT_FALSE(IsBreak("'a' | U+200E")); // lrm 125 EXPECT_FALSE(IsBreak("'a' | U+202A")); // lre 126 EXPECT_FALSE(IsBreak("'a' | U+E0041")); // tag character 127 128 // UTC-approved characters for the Prepend class 129 EXPECT_FALSE(IsBreak("U+06DD | U+0661")); // arabic subtending mark + digit one 130 131 EXPECT_TRUE(IsBreak("U+0E01 | U+0E33")); // Thai sara am 132 133 // virama is not a grapheme break, but "pure killer" is 134 EXPECT_FALSE(IsBreak("U+0915 | U+094D U+0915")); // Devanagari ka+virama+ka 135 EXPECT_FALSE(IsBreak("U+0915 U+094D | U+0915")); // Devanagari ka+virama+ka 136 EXPECT_FALSE(IsBreak("U+0E01 | U+0E3A U+0E01")); // thai phinthu = pure killer 137 EXPECT_TRUE(IsBreak("U+0E01 U+0E3A | U+0E01")); // thai phinthu = pure killer 138 139 // suppress grapheme breaks in zwj emoji sequences, see 140 // http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html 141 EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468")); 142 EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468")); 143 EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468")); 144 EXPECT_FALSE(IsBreak("U+1F468 U+200D | U+1F469 U+200D U+1F466")); 145 EXPECT_FALSE(IsBreak("U+1F468 U+200D U+1F469 U+200D | U+1F466")); 146 EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F469 U+200D U+1F467 U+200D U+1F466")); 147 EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D | U+1F467 U+200D U+1F466")); 148 EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466")); 149 EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8")); 150 151 // Do not break before and after zwj with all kind of emoji characters. 152 EXPECT_FALSE(IsBreak("U+1F431 | U+200D U+1F464")); 153 EXPECT_FALSE(IsBreak("U+1F431 U+200D | U+1F464")); 154 155 // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break 156 EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764")); 157 } 158 159 TEST(GraphemeBreak, emojiModifiers) { 160 EXPECT_FALSE(IsBreak("U+261D | U+1F3FB")); // white up pointing index + modifier 161 EXPECT_FALSE(IsBreak("U+270C | U+1F3FB")); // victory hand + modifier 162 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB")); // boy + modifier 163 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC")); // boy + modifier 164 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD")); // boy + modifier 165 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE")); // boy + modifier 166 EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF")); // boy + modifier 167 EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF")); // sign of the horns + modifier 168 EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF")); // selfie (Unicode 9) + modifier 169 170 // adding emoji style variation selector doesn't affect grapheme cluster 171 EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier 172 EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB")); // heart + emoji style + modifier 173 174 // heart is not an emoji base 175 EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB")); // heart + modifier 176 EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB")); // heart + emoji style + modifier 177 EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB")); // heart + emoji style + modifier 178 EXPECT_TRUE(IsBreak("U+1F3FB | U+1F3FB")); // modifier + modifier 179 180 // rat is not an emoji modifer 181 EXPECT_TRUE(IsBreak("U+1F466 | U+1F400")); // boy + rat 182 } 183 184 TEST(GraphemeBreak, offsets) { 185 uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 }; 186 EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 2)); 187 EXPECT_FALSE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 3)); 188 EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 4)); 189 EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 5)); 190 } 191