Home | History | Annotate | Download | only in tests
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <gtest/gtest.h>
     18 #include "ICUTestBase.h"
     19 #include "UnicodeUtils.h"
     20 #include <minikin/WordBreaker.h>
     21 #include <unicode/locid.h>
     22 #include <unicode/uclean.h>
     23 #include <unicode/udata.h>
     24 
     25 #define LOG_TAG "Minikin"
     26 #include <cutils/log.h>
     27 
     28 #ifndef NELEM
     29 #define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
     30 #endif
     31 
     32 #define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
     33 
     34 using namespace android;
     35 
     36 typedef ICUTestBase WordBreakerTest;
     37 
     38 TEST_F(WordBreakerTest, basic) {
     39     uint16_t buf[] = {'h', 'e', 'l', 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
     40     WordBreaker breaker;
     41     breaker.setLocale(icu::Locale::getEnglish());
     42     breaker.setText(buf, NELEM(buf));
     43     EXPECT_EQ(0, breaker.current());
     44     EXPECT_EQ(6, breaker.next());  // after "hello "
     45     EXPECT_EQ(0, breaker.wordStart());  // "hello"
     46     EXPECT_EQ(5, breaker.wordEnd());
     47     EXPECT_EQ(0, breaker.breakBadness());
     48     EXPECT_EQ(6, breaker.current());
     49     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
     50     EXPECT_EQ(6, breaker.wordStart());  // "world"
     51     EXPECT_EQ(11, breaker.wordEnd());
     52     EXPECT_EQ(0, breaker.breakBadness());
     53     EXPECT_EQ(11, breaker.current());
     54 }
     55 
     56 TEST_F(WordBreakerTest, softHyphen) {
     57     uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l' ,'o', ' ', 'w', 'o', 'r', 'l', 'd'};
     58     WordBreaker breaker;
     59     breaker.setLocale(icu::Locale::getEnglish());
     60     breaker.setText(buf, NELEM(buf));
     61     EXPECT_EQ(0, breaker.current());
     62     EXPECT_EQ(7, breaker.next());  // after "hel{SOFT HYPHEN}lo "
     63     EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
     64     EXPECT_EQ(6, breaker.wordEnd());
     65     EXPECT_EQ(0, breaker.breakBadness());
     66     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
     67     EXPECT_EQ(7, breaker.wordStart());  // "world"
     68     EXPECT_EQ(12, breaker.wordEnd());
     69     EXPECT_EQ(0, breaker.breakBadness());
     70 }
     71 
     72 TEST_F(WordBreakerTest, postfixAndPrefix) {
     73     uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5}; // US JP
     74     WordBreaker breaker;
     75     breaker.setLocale(icu::Locale::getEnglish());
     76     breaker.setText(buf, NELEM(buf));
     77     EXPECT_EQ(0, breaker.current());
     78 
     79     EXPECT_EQ(4, breaker.next());  // after CENT SIGN
     80     EXPECT_EQ(0, breaker.wordStart());  // "US"
     81     EXPECT_EQ(3, breaker.wordEnd());
     82 
     83     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end of string
     84     EXPECT_EQ(4, breaker.wordStart());  // "JP"
     85     EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
     86 }
     87 
     88 TEST_F(WordBreakerTest, MyanmarKinzi) {
     89     uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};  // NGA, ASAT, VIRAMA, KA, UU
     90     WordBreaker breaker;
     91     icu::Locale burmese("my");
     92     breaker.setLocale(burmese);
     93     breaker.setText(buf, NELEM(buf));
     94     EXPECT_EQ(0, breaker.current());
     95 
     96     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end of string
     97     EXPECT_EQ(0, breaker.wordStart());
     98     EXPECT_EQ((ssize_t)NELEM(buf), breaker.wordEnd());
     99 }
    100 
    101 TEST_F(WordBreakerTest, zwjEmojiSequences) {
    102     uint16_t buf[] = {
    103         // man + zwj + heart + zwj + man
    104         UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
    105         // woman + zwj + heart + zwj + kiss mark + zwj + woman
    106         UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
    107         // eye + zwj + left speech bubble
    108         UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
    109         // CAT FACE + zwj + BUST IN SILHOUETTE
    110         UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    111     };
    112     WordBreaker breaker;
    113     breaker.setLocale(icu::Locale::getEnglish());
    114     breaker.setText(buf, NELEM(buf));
    115     EXPECT_EQ(0, breaker.current());
    116     EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
    117     EXPECT_EQ(0, breaker.wordStart());
    118     EXPECT_EQ(7, breaker.wordEnd());
    119     EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + woman
    120     EXPECT_EQ(7, breaker.wordStart());
    121     EXPECT_EQ(17, breaker.wordEnd());
    122     EXPECT_EQ(22, breaker.next());  // after eye + zwj + left speech bubble
    123     EXPECT_EQ(17, breaker.wordStart());
    124     EXPECT_EQ(22, breaker.wordEnd());
    125     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    126     EXPECT_EQ(22, breaker.wordStart());
    127     EXPECT_EQ(27, breaker.wordEnd());
    128 }
    129 
    130 TEST_F(WordBreakerTest, emojiWithModifier) {
    131     uint16_t buf[] = {
    132         UTF16(0x1F466), UTF16(0x1F3FB),  // boy + type 1-2 fitzpatrick modifier
    133         0x270C, 0xFE0F, UTF16(0x1F3FF)  // victory hand + emoji style + type 6 fitzpatrick modifier
    134     };
    135     WordBreaker breaker;
    136     breaker.setLocale(icu::Locale::getEnglish());
    137     breaker.setText(buf, NELEM(buf));
    138     EXPECT_EQ(0, breaker.current());
    139     EXPECT_EQ(4, breaker.next());  // after man + type 6 fitzpatrick modifier
    140     EXPECT_EQ(0, breaker.wordStart());
    141     EXPECT_EQ(4, breaker.wordEnd());
    142     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    143     EXPECT_EQ(4, breaker.wordStart());
    144     EXPECT_EQ(8, breaker.wordEnd());
    145 }
    146 
    147 TEST_F(WordBreakerTest, punct) {
    148     uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
    149         '!', '!'};
    150     WordBreaker breaker;
    151     breaker.setLocale(icu::Locale::getEnglish());
    152     breaker.setText(buf, NELEM(buf));
    153     EXPECT_EQ(0, breaker.current());
    154     EXPECT_EQ(9, breaker.next());  // after "hello, "
    155     EXPECT_EQ(2, breaker.wordStart());  // "hello"
    156     EXPECT_EQ(7, breaker.wordEnd());
    157     EXPECT_EQ(0, breaker.breakBadness());
    158     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    159     EXPECT_EQ(9, breaker.wordStart());  // "world"
    160     EXPECT_EQ(14, breaker.wordEnd());
    161     EXPECT_EQ(0, breaker.breakBadness());
    162 }
    163 
    164 TEST_F(WordBreakerTest, email) {
    165     uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
    166         ' ', 'x'};
    167     WordBreaker breaker;
    168     breaker.setLocale(icu::Locale::getEnglish());
    169     breaker.setText(buf, NELEM(buf));
    170     EXPECT_EQ(0, breaker.current());
    171     EXPECT_EQ(11, breaker.next());  // after "foo@example"
    172     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    173     EXPECT_EQ(1, breaker.breakBadness());
    174     EXPECT_EQ(16, breaker.next());  // after ".com "
    175     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    176     EXPECT_EQ(0, breaker.breakBadness());
    177     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    178     EXPECT_EQ(16, breaker.wordStart());  // "x"
    179     EXPECT_EQ(17, breaker.wordEnd());
    180     EXPECT_EQ(0, breaker.breakBadness());
    181 }
    182 
    183 TEST_F(WordBreakerTest, mailto) {
    184     uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@',
    185         'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    186     WordBreaker breaker;
    187     breaker.setLocale(icu::Locale::getEnglish());
    188     breaker.setText(buf, NELEM(buf));
    189     EXPECT_EQ(0, breaker.current());
    190     EXPECT_EQ(7, breaker.next());  // after "mailto:"
    191     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    192     EXPECT_EQ(1, breaker.breakBadness());
    193     EXPECT_EQ(18, breaker.next());  // after "foo@example"
    194     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    195     EXPECT_EQ(1, breaker.breakBadness());
    196     EXPECT_EQ(23, breaker.next());  // after ".com "
    197     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    198     EXPECT_EQ(0, breaker.breakBadness());
    199     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    200     EXPECT_EQ(23, breaker.wordStart());  // "x"
    201     EXPECT_EQ(24, breaker.wordEnd());
    202     EXPECT_EQ(0, breaker.breakBadness());
    203 }
    204 
    205 // The current logic always places a line break after a detected email address or URL
    206 // and an immediately following non-ASCII character.
    207 TEST_F(WordBreakerTest, emailNonAscii) {
    208     uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
    209         0x4E00};
    210     WordBreaker breaker;
    211     breaker.setLocale(icu::Locale::getEnglish());
    212     breaker.setText(buf, NELEM(buf));
    213     EXPECT_EQ(0, breaker.current());
    214     EXPECT_EQ(11, breaker.next());  // after "foo@example"
    215     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    216     EXPECT_EQ(1, breaker.breakBadness());
    217     EXPECT_EQ(15, breaker.next());  // after ".com"
    218     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    219     EXPECT_EQ(0, breaker.breakBadness());
    220     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    221     EXPECT_EQ(15, breaker.wordStart());  // ""
    222     EXPECT_EQ(16, breaker.wordEnd());
    223     EXPECT_EQ(0, breaker.breakBadness());
    224 }
    225 
    226 TEST_F(WordBreakerTest, emailCombining) {
    227     uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
    228         0x0303, ' ', 'x'};
    229     WordBreaker breaker;
    230     breaker.setLocale(icu::Locale::getEnglish());
    231     breaker.setText(buf, NELEM(buf));
    232     EXPECT_EQ(0, breaker.current());
    233     EXPECT_EQ(11, breaker.next());  // after "foo@example"
    234     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    235     EXPECT_EQ(1, breaker.breakBadness());
    236     EXPECT_EQ(17, breaker.next());  // after ".com "
    237     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    238     EXPECT_EQ(0, breaker.breakBadness());
    239     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    240     EXPECT_EQ(17, breaker.wordStart());  // "x"
    241     EXPECT_EQ(18, breaker.wordEnd());
    242     EXPECT_EQ(0, breaker.breakBadness());
    243 }
    244 
    245 TEST_F(WordBreakerTest, lonelyAt) {
    246     uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
    247     WordBreaker breaker;
    248     breaker.setLocale(icu::Locale::getEnglish());
    249     breaker.setText(buf, NELEM(buf));
    250     EXPECT_EQ(0, breaker.current());
    251     EXPECT_EQ(2, breaker.next());  // after "a "
    252     EXPECT_EQ(0, breaker.wordStart());  // "a"
    253     EXPECT_EQ(1, breaker.wordEnd());
    254     EXPECT_EQ(0, breaker.breakBadness());
    255     EXPECT_EQ(4, breaker.next());  // after "@ "
    256     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    257     EXPECT_EQ(0, breaker.breakBadness());
    258     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    259     EXPECT_EQ(4, breaker.wordStart());  // "b"
    260     EXPECT_EQ(5, breaker.wordEnd());
    261     EXPECT_EQ(0, breaker.breakBadness());
    262 }
    263 
    264 TEST_F(WordBreakerTest, url) {
    265     uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
    266         '.', 'c', 'o', 'm', ' ', 'x'};
    267     WordBreaker breaker;
    268     breaker.setLocale(icu::Locale::getEnglish());
    269     breaker.setText(buf, NELEM(buf));
    270     EXPECT_EQ(0, breaker.current());
    271     EXPECT_EQ(5, breaker.next());  // after "http:"
    272     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    273     EXPECT_EQ(1, breaker.breakBadness());
    274     EXPECT_EQ(7, breaker.next());  // after "//"
    275     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    276     EXPECT_EQ(1, breaker.breakBadness());
    277     EXPECT_EQ(14, breaker.next());  // after "example"
    278     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    279     EXPECT_EQ(1, breaker.breakBadness());
    280     EXPECT_EQ(19, breaker.next());  // after ".com "
    281     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    282     EXPECT_EQ(0, breaker.breakBadness());
    283     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    284     EXPECT_EQ(19, breaker.wordStart());  // "x"
    285     EXPECT_EQ(20, breaker.wordEnd());
    286     EXPECT_EQ(0, breaker.breakBadness());
    287 }
    288 
    289 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
    290 TEST_F(WordBreakerTest, urlBreakChars) {
    291     uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
    292         '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
    293     WordBreaker breaker;
    294     breaker.setLocale(icu::Locale::getEnglish());
    295     breaker.setText(buf, NELEM(buf));
    296     EXPECT_EQ(0, breaker.current());
    297     EXPECT_EQ(5, breaker.next());  // after "http:"
    298     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    299     EXPECT_EQ(1, breaker.breakBadness());
    300     EXPECT_EQ(7, breaker.next());  // after "//"
    301     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    302     EXPECT_EQ(1, breaker.breakBadness());
    303     EXPECT_EQ(8, breaker.next());  // after "a"
    304     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    305     EXPECT_EQ(1, breaker.breakBadness());
    306     EXPECT_EQ(10, breaker.next());  // after ".b"
    307     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    308     EXPECT_EQ(1, breaker.breakBadness());
    309     EXPECT_EQ(11, breaker.next());  // after "/"
    310     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    311     EXPECT_EQ(1, breaker.breakBadness());
    312     EXPECT_EQ(13, breaker.next());  // after "~c"
    313     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    314     EXPECT_EQ(1, breaker.breakBadness());
    315     EXPECT_EQ(15, breaker.next());  // after ",d"
    316     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    317     EXPECT_EQ(1, breaker.breakBadness());
    318     EXPECT_EQ(17, breaker.next());  // after "-e"
    319     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    320     EXPECT_EQ(1, breaker.breakBadness());
    321     EXPECT_EQ(19, breaker.next());  // after "?f"
    322     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    323     EXPECT_EQ(1, breaker.breakBadness());
    324     EXPECT_EQ(20, breaker.next());  // after "="
    325     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    326     EXPECT_EQ(1, breaker.breakBadness());
    327     EXPECT_EQ(21, breaker.next());  // after "g"
    328     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    329     EXPECT_EQ(1, breaker.breakBadness());
    330     EXPECT_EQ(22, breaker.next());  // after "&"
    331     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    332     EXPECT_EQ(1, breaker.breakBadness());
    333     EXPECT_EQ(23, breaker.next());  // after "h"
    334     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    335     EXPECT_EQ(1, breaker.breakBadness());
    336     EXPECT_EQ(25, breaker.next());  // after "#i"
    337     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    338     EXPECT_EQ(1, breaker.breakBadness());
    339     EXPECT_EQ(27, breaker.next());  // after "%j"
    340     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    341     EXPECT_EQ(1, breaker.breakBadness());
    342     EXPECT_EQ(29, breaker.next());  // after "_k"
    343     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    344     EXPECT_EQ(1, breaker.breakBadness());
    345     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    346     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    347     EXPECT_EQ(0, breaker.breakBadness());
    348 }
    349 
    350 TEST_F(WordBreakerTest, urlNoHyphenBreak) {
    351     uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    352     WordBreaker breaker;
    353     breaker.setLocale(icu::Locale::getEnglish());
    354     breaker.setText(buf, NELEM(buf));
    355     EXPECT_EQ(0, breaker.current());
    356     EXPECT_EQ(5, breaker.next());  // after "http:"
    357     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    358     EXPECT_EQ(7, breaker.next());  // after "//"
    359     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    360     EXPECT_EQ(8, breaker.next());  // after "a"
    361     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    362     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    363     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    364 }
    365 
    366 TEST_F(WordBreakerTest, urlEndsWithSlash) {
    367     uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    368     WordBreaker breaker;
    369     breaker.setLocale(icu::Locale::getEnglish());
    370     breaker.setText(buf, NELEM(buf));
    371     EXPECT_EQ(0, breaker.current());
    372     EXPECT_EQ(5, breaker.next());  // after "http:"
    373     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    374     EXPECT_EQ(7, breaker.next());  // after "//"
    375     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    376     EXPECT_EQ(8, breaker.next());  // after "a"
    377     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    378     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    379     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    380 }
    381 
    382 TEST_F(WordBreakerTest, emailStartsWithSlash) {
    383     uint16_t buf[] = {'/', 'a', '@', 'b'};
    384     WordBreaker breaker;
    385     breaker.setLocale(icu::Locale::getEnglish());
    386     breaker.setText(buf, NELEM(buf));
    387     EXPECT_EQ(0, breaker.current());
    388     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
    389     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
    390 }
    391