1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <gtest/gtest.h> 18 #include "ICUTestBase.h" 19 #include "UnicodeUtils.h" 20 #include <minikin/WordBreaker.h> 21 #include <unicode/locid.h> 22 #include <unicode/uclean.h> 23 #include <unicode/udata.h> 24 25 #define LOG_TAG "Minikin" 26 #include <cutils/log.h> 27 28 #ifndef NELEM 29 #define NELEM(x) ((sizeof(x) / sizeof((x)[0]))) 30 #endif 31 32 #define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint) 33 34 using namespace android; 35 36 typedef ICUTestBase WordBreakerTest; 37 38 TEST_F(WordBreakerTest, basic) { 39 uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 40 WordBreaker breaker; 41 breaker.setLocale(icu::Locale::getEnglish()); 42 breaker.setText(buf, NELEM(buf)); 43 EXPECT_EQ(0, breaker.current()); 44 EXPECT_EQ(6, breaker.next()); // after "hello " 45 EXPECT_EQ(0, breaker.wordStart()); // "hello" 46 EXPECT_EQ(5, breaker.wordEnd()); 47 EXPECT_EQ(0, breaker.breakBadness()); 48 EXPECT_EQ(6, breaker.current()); 49 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 50 EXPECT_EQ(6, breaker.wordStart()); // "world" 51 EXPECT_EQ(11, breaker.wordEnd()); 52 EXPECT_EQ(0, breaker.breakBadness()); 53 EXPECT_EQ(11, breaker.current()); 54 } 55 56 TEST_F(WordBreakerTest, softHyphen) { 57 uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'}; 58 WordBreaker breaker; 59 breaker.setLocale(icu::Locale::getEnglish()); 60 breaker.setText(buf, NELEM(buf)); 61 EXPECT_EQ(0, breaker.current()); 62 EXPECT_EQ(7, breaker.next()); // after "hel{SOFT HYPHEN}lo " 63 EXPECT_EQ(0, breaker.wordStart()); // "hel{SOFT HYPHEN}lo" 64 EXPECT_EQ(6, breaker.wordEnd()); 65 EXPECT_EQ(0, breaker.breakBadness()); 66 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 67 EXPECT_EQ(7, breaker.wordStart()); // "world" 68 EXPECT_EQ(12, breaker.wordEnd()); 69 EXPECT_EQ(0, breaker.breakBadness()); 70 } 71 72 TEST_F(WordBreakerTest, postfixAndPrefix) { 73 uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5}; // US JP 74 WordBreaker breaker; 75 breaker.setLocale(icu::Locale::getEnglish()); 76 breaker.setText(buf, NELEM(buf)); 77 EXPECT_EQ(0, breaker.current()); 78 79 EXPECT_EQ(4, breaker.next()); // after CENT SIGN 80 EXPECT_EQ(0, breaker.wordStart()); // "US" 81 EXPECT_EQ(3, breaker.wordEnd()); 82 83 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of string 84 EXPECT_EQ(4, breaker.wordStart()); // "JP" 85 EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); 86 } 87 88 TEST_F(WordBreakerTest, MyanmarKinzi) { 89 uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C}; // NGA, ASAT, VIRAMA, KA, UU 90 WordBreaker breaker; 91 icu::Locale burmese("my"); 92 breaker.setLocale(burmese); 93 breaker.setText(buf, NELEM(buf)); 94 EXPECT_EQ(0, breaker.current()); 95 96 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of string 97 EXPECT_EQ(0, breaker.wordStart()); 98 EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd()); 99 } 100 101 TEST_F(WordBreakerTest, zwjEmojiSequences) { 102 uint16_t buf[] = { 103 // man + zwj + heart + zwj + man 104 UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468), 105 // woman + zwj + heart + zwj + kiss mark + zwj + woman 106 UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469), 107 // eye + zwj + left speech bubble 108 UTF16(0x1F441), 0x200D, UTF16(0x1F5E8), 109 // CAT FACE + zwj + BUST IN SILHOUETTE 110 UTF16(0x1F431), 0x200D, UTF16(0x1F464), 111 }; 112 WordBreaker breaker; 113 breaker.setLocale(icu::Locale::getEnglish()); 114 breaker.setText(buf, NELEM(buf)); 115 EXPECT_EQ(0, breaker.current()); 116 EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man 117 EXPECT_EQ(0, breaker.wordStart()); 118 EXPECT_EQ(7, breaker.wordEnd()); 119 EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + woman 120 EXPECT_EQ(7, breaker.wordStart()); 121 EXPECT_EQ(17, breaker.wordEnd()); 122 EXPECT_EQ(22, breaker.next()); // after eye + zwj + left speech bubble 123 EXPECT_EQ(17, breaker.wordStart()); 124 EXPECT_EQ(22, breaker.wordEnd()); 125 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 126 EXPECT_EQ(22, breaker.wordStart()); 127 EXPECT_EQ(27, breaker.wordEnd()); 128 } 129 130 TEST_F(WordBreakerTest, emojiWithModifier) { 131 uint16_t buf[] = { 132 UTF16(0x1F466), UTF16(0x1F3FB), // boy + type 1-2 fitzpatrick modifier 133 0x270C, 0xFE0F, UTF16(0x1F3FF) // victory hand + emoji style + type 6 fitzpatrick modifier 134 }; 135 WordBreaker breaker; 136 breaker.setLocale(icu::Locale::getEnglish()); 137 breaker.setText(buf, NELEM(buf)); 138 EXPECT_EQ(0, breaker.current()); 139 EXPECT_EQ(4, breaker.next()); // after man + type 6 fitzpatrick modifier 140 EXPECT_EQ(0, breaker.wordStart()); 141 EXPECT_EQ(4, breaker.wordEnd()); 142 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 143 EXPECT_EQ(4, breaker.wordStart()); 144 EXPECT_EQ(8, breaker.wordEnd()); 145 } 146 147 TEST_F(WordBreakerTest, punct) { 148 uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', 149 '!', '!'}; 150 WordBreaker breaker; 151 breaker.setLocale(icu::Locale::getEnglish()); 152 breaker.setText(buf, NELEM(buf)); 153 EXPECT_EQ(0, breaker.current()); 154 EXPECT_EQ(9, breaker.next()); // after "hello, " 155 EXPECT_EQ(2, breaker.wordStart()); // "hello" 156 EXPECT_EQ(7, breaker.wordEnd()); 157 EXPECT_EQ(0, breaker.breakBadness()); 158 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 159 EXPECT_EQ(9, breaker.wordStart()); // "world" 160 EXPECT_EQ(14, breaker.wordEnd()); 161 EXPECT_EQ(0, breaker.breakBadness()); 162 } 163 164 TEST_F(WordBreakerTest, email) { 165 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 166 ' ', 'x'}; 167 WordBreaker breaker; 168 breaker.setLocale(icu::Locale::getEnglish()); 169 breaker.setText(buf, NELEM(buf)); 170 EXPECT_EQ(0, breaker.current()); 171 EXPECT_EQ(11, breaker.next()); // after "foo@example" 172 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 173 EXPECT_EQ(1, breaker.breakBadness()); 174 EXPECT_EQ(16, breaker.next()); // after ".com " 175 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 176 EXPECT_EQ(0, breaker.breakBadness()); 177 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 178 EXPECT_EQ(16, breaker.wordStart()); // "x" 179 EXPECT_EQ(17, breaker.wordEnd()); 180 EXPECT_EQ(0, breaker.breakBadness()); 181 } 182 183 TEST_F(WordBreakerTest, mailto) { 184 uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 185 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'}; 186 WordBreaker breaker; 187 breaker.setLocale(icu::Locale::getEnglish()); 188 breaker.setText(buf, NELEM(buf)); 189 EXPECT_EQ(0, breaker.current()); 190 EXPECT_EQ(7, breaker.next()); // after "mailto:" 191 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 192 EXPECT_EQ(1, breaker.breakBadness()); 193 EXPECT_EQ(18, breaker.next()); // after "foo@example" 194 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 195 EXPECT_EQ(1, breaker.breakBadness()); 196 EXPECT_EQ(23, breaker.next()); // after ".com " 197 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 198 EXPECT_EQ(0, breaker.breakBadness()); 199 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 200 EXPECT_EQ(23, breaker.wordStart()); // "x" 201 EXPECT_EQ(24, breaker.wordEnd()); 202 EXPECT_EQ(0, breaker.breakBadness()); 203 } 204 205 // The current logic always places a line break after a detected email address or URL 206 // and an immediately following non-ASCII character. 207 TEST_F(WordBreakerTest, emailNonAscii) { 208 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 209 0x4E00}; 210 WordBreaker breaker; 211 breaker.setLocale(icu::Locale::getEnglish()); 212 breaker.setText(buf, NELEM(buf)); 213 EXPECT_EQ(0, breaker.current()); 214 EXPECT_EQ(11, breaker.next()); // after "foo@example" 215 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 216 EXPECT_EQ(1, breaker.breakBadness()); 217 EXPECT_EQ(15, breaker.next()); // after ".com" 218 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 219 EXPECT_EQ(0, breaker.breakBadness()); 220 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 221 EXPECT_EQ(15, breaker.wordStart()); // "" 222 EXPECT_EQ(16, breaker.wordEnd()); 223 EXPECT_EQ(0, breaker.breakBadness()); 224 } 225 226 TEST_F(WordBreakerTest, emailCombining) { 227 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', 228 0x0303, ' ', 'x'}; 229 WordBreaker breaker; 230 breaker.setLocale(icu::Locale::getEnglish()); 231 breaker.setText(buf, NELEM(buf)); 232 EXPECT_EQ(0, breaker.current()); 233 EXPECT_EQ(11, breaker.next()); // after "foo@example" 234 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 235 EXPECT_EQ(1, breaker.breakBadness()); 236 EXPECT_EQ(17, breaker.next()); // after ".com " 237 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 238 EXPECT_EQ(0, breaker.breakBadness()); 239 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 240 EXPECT_EQ(17, breaker.wordStart()); // "x" 241 EXPECT_EQ(18, breaker.wordEnd()); 242 EXPECT_EQ(0, breaker.breakBadness()); 243 } 244 245 TEST_F(WordBreakerTest, lonelyAt) { 246 uint16_t buf[] = {'a', ' ', '@', ' ', 'b'}; 247 WordBreaker breaker; 248 breaker.setLocale(icu::Locale::getEnglish()); 249 breaker.setText(buf, NELEM(buf)); 250 EXPECT_EQ(0, breaker.current()); 251 EXPECT_EQ(2, breaker.next()); // after "a " 252 EXPECT_EQ(0, breaker.wordStart()); // "a" 253 EXPECT_EQ(1, breaker.wordEnd()); 254 EXPECT_EQ(0, breaker.breakBadness()); 255 EXPECT_EQ(4, breaker.next()); // after "@ " 256 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 257 EXPECT_EQ(0, breaker.breakBadness()); 258 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 259 EXPECT_EQ(4, breaker.wordStart()); // "b" 260 EXPECT_EQ(5, breaker.wordEnd()); 261 EXPECT_EQ(0, breaker.breakBadness()); 262 } 263 264 TEST_F(WordBreakerTest, url) { 265 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e', 266 '.', 'c', 'o', 'm', ' ', 'x'}; 267 WordBreaker breaker; 268 breaker.setLocale(icu::Locale::getEnglish()); 269 breaker.setText(buf, NELEM(buf)); 270 EXPECT_EQ(0, breaker.current()); 271 EXPECT_EQ(5, breaker.next()); // after "http:" 272 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 273 EXPECT_EQ(1, breaker.breakBadness()); 274 EXPECT_EQ(7, breaker.next()); // after "//" 275 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 276 EXPECT_EQ(1, breaker.breakBadness()); 277 EXPECT_EQ(14, breaker.next()); // after "example" 278 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 279 EXPECT_EQ(1, breaker.breakBadness()); 280 EXPECT_EQ(19, breaker.next()); // after ".com " 281 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 282 EXPECT_EQ(0, breaker.breakBadness()); 283 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 284 EXPECT_EQ(19, breaker.wordStart()); // "x" 285 EXPECT_EQ(20, breaker.wordEnd()); 286 EXPECT_EQ(0, breaker.breakBadness()); 287 } 288 289 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks* 290 TEST_F(WordBreakerTest, urlBreakChars) { 291 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd', 292 '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'}; 293 WordBreaker breaker; 294 breaker.setLocale(icu::Locale::getEnglish()); 295 breaker.setText(buf, NELEM(buf)); 296 EXPECT_EQ(0, breaker.current()); 297 EXPECT_EQ(5, breaker.next()); // after "http:" 298 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 299 EXPECT_EQ(1, breaker.breakBadness()); 300 EXPECT_EQ(7, breaker.next()); // after "//" 301 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 302 EXPECT_EQ(1, breaker.breakBadness()); 303 EXPECT_EQ(8, breaker.next()); // after "a" 304 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 305 EXPECT_EQ(1, breaker.breakBadness()); 306 EXPECT_EQ(10, breaker.next()); // after ".b" 307 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 308 EXPECT_EQ(1, breaker.breakBadness()); 309 EXPECT_EQ(11, breaker.next()); // after "/" 310 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 311 EXPECT_EQ(1, breaker.breakBadness()); 312 EXPECT_EQ(13, breaker.next()); // after "~c" 313 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 314 EXPECT_EQ(1, breaker.breakBadness()); 315 EXPECT_EQ(15, breaker.next()); // after ",d" 316 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 317 EXPECT_EQ(1, breaker.breakBadness()); 318 EXPECT_EQ(17, breaker.next()); // after "-e" 319 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 320 EXPECT_EQ(1, breaker.breakBadness()); 321 EXPECT_EQ(19, breaker.next()); // after "?f" 322 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 323 EXPECT_EQ(1, breaker.breakBadness()); 324 EXPECT_EQ(20, breaker.next()); // after "=" 325 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 326 EXPECT_EQ(1, breaker.breakBadness()); 327 EXPECT_EQ(21, breaker.next()); // after "g" 328 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 329 EXPECT_EQ(1, breaker.breakBadness()); 330 EXPECT_EQ(22, breaker.next()); // after "&" 331 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 332 EXPECT_EQ(1, breaker.breakBadness()); 333 EXPECT_EQ(23, breaker.next()); // after "h" 334 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 335 EXPECT_EQ(1, breaker.breakBadness()); 336 EXPECT_EQ(25, breaker.next()); // after "#i" 337 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 338 EXPECT_EQ(1, breaker.breakBadness()); 339 EXPECT_EQ(27, breaker.next()); // after "%j" 340 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 341 EXPECT_EQ(1, breaker.breakBadness()); 342 EXPECT_EQ(29, breaker.next()); // after "_k" 343 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 344 EXPECT_EQ(1, breaker.breakBadness()); 345 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 346 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 347 EXPECT_EQ(0, breaker.breakBadness()); 348 } 349 350 TEST_F(WordBreakerTest, urlNoHyphenBreak) { 351 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'}; 352 WordBreaker breaker; 353 breaker.setLocale(icu::Locale::getEnglish()); 354 breaker.setText(buf, NELEM(buf)); 355 EXPECT_EQ(0, breaker.current()); 356 EXPECT_EQ(5, breaker.next()); // after "http:" 357 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 358 EXPECT_EQ(7, breaker.next()); // after "//" 359 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 360 EXPECT_EQ(8, breaker.next()); // after "a" 361 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 362 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 363 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 364 } 365 366 TEST_F(WordBreakerTest, urlEndsWithSlash) { 367 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'}; 368 WordBreaker breaker; 369 breaker.setLocale(icu::Locale::getEnglish()); 370 breaker.setText(buf, NELEM(buf)); 371 EXPECT_EQ(0, breaker.current()); 372 EXPECT_EQ(5, breaker.next()); // after "http:" 373 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 374 EXPECT_EQ(7, breaker.next()); // after "//" 375 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 376 EXPECT_EQ(8, breaker.next()); // after "a" 377 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 378 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 379 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 380 } 381 382 TEST_F(WordBreakerTest, emailStartsWithSlash) { 383 uint16_t buf[] = {'/', 'a', '@', 'b'}; 384 WordBreaker breaker; 385 breaker.setLocale(icu::Locale::getEnglish()); 386 breaker.setText(buf, NELEM(buf)); 387 EXPECT_EQ(0, breaker.current()); 388 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end 389 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd()); 390 } 391