Home | History | Annotate | Download | only in tests
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include <gtest/gtest.h>
     18 #include <UnicodeUtils.h>
     19 #include <minikin/GraphemeBreak.h>
     20 
     21 using namespace android;
     22 
     23 bool IsBreak(const char* src) {  // test helper: hands `src` to ParseUnicode, then asks GraphemeBreak about the resulting offset
     24     const size_t BUF_SIZE = 256;  // capacity (in uint16_t units) of the scratch buffer passed to ParseUnicode
     25     uint16_t buf[BUF_SIZE];
     26     size_t offset;  // NOTE(review): not initialized here; presumably always written by ParseUnicode (the '|' position in `src`?) - confirm
     27     size_t size;  // NOTE(review): likewise left for ParseUnicode to fill in - confirm it is set on every path
     28     ParseUnicode(buf, BUF_SIZE, src, &size, &offset);
     29     return GraphemeBreak::isGraphemeBreak(buf, 0, size, offset);  // whole parsed buffer is queried: second argument 0, third argument `size`
     30 }
     31 
     32 TEST(GraphemeBreak, utf16) {  // surrogate handling: no break inside a well-formed pair, break around unpaired surrogates
     33     EXPECT_FALSE(IsBreak("U+D83C | U+DC31"));  // well-formed surrogate pair; encodes U+1F031 (U+1F431 would be U+D83D U+DC31)
     34 
     35     // tests for invalid UTF-16
     36     EXPECT_TRUE(IsBreak("U+D800 | U+D800"));  // two leading surrogates
     37     EXPECT_TRUE(IsBreak("U+DC00 | U+DC00"));  // two trailing surrogates
     38     EXPECT_TRUE(IsBreak("'a' | U+D800"));  // lonely leading surrogate
     39     EXPECT_TRUE(IsBreak("U+DC00 | 'a'"));  // lonely trailing surrogate
     40     EXPECT_TRUE(IsBreak("U+D800 | 'a'"));  // leading surrogate followed by non-surrogate
     41     EXPECT_TRUE(IsBreak("'a' | U+DC00"));  // non-surrogate followed by trailing surrogate
     42 }
     43 
     44 TEST(GraphemeBreak, rules) {  // one group of expectations per grapheme cluster boundary rule (GB1..GB10), labelled below
     45     // Rule GB1, sot ÷; Rule GB2, ÷ eot
     46     EXPECT_TRUE(IsBreak("| 'a'"));
     47     EXPECT_TRUE(IsBreak("'a' |"));
     48 
     49     // Rule GB3, CR x LF
     50     EXPECT_FALSE(IsBreak("U+000D | U+000A"));  // CR x LF
     51 
     52     // Rule GB4, (Control | CR | LF) ÷
     53     EXPECT_TRUE(IsBreak("'a' | U+2028"));  // Line separator
     54     EXPECT_TRUE(IsBreak("'a' | U+000D"));  // CR
     55     EXPECT_TRUE(IsBreak("'a' | U+000A"));  // LF
     56 
     57     // Rule GB5, ÷ (Control | CR | LF)
     58     EXPECT_TRUE(IsBreak("U+2028 | 'a'"));  // Line separator
     59     EXPECT_TRUE(IsBreak("U+000D | 'a'"));  // CR
     60     EXPECT_TRUE(IsBreak("U+000A | 'a'"));  // LF
     61 
     62     // Rule GB6, L x ( L | V | LV | LVT )
     63     EXPECT_FALSE(IsBreak("U+1100 | U+1100"));  // L x L
     64     EXPECT_FALSE(IsBreak("U+1100 | U+1161"));  // L x V
     65     EXPECT_FALSE(IsBreak("U+1100 | U+AC00"));  // L x LV
     66     EXPECT_FALSE(IsBreak("U+1100 | U+AC01"));  // L x LVT
     67 
     68     // Rule GB7, ( LV | V ) x ( V | T )
     69     EXPECT_FALSE(IsBreak("U+AC00 | U+1161"));  // LV x V
     70     EXPECT_FALSE(IsBreak("U+1161 | U+1161"));  // V x V
     71     EXPECT_FALSE(IsBreak("U+AC00 | U+11A8"));  // LV x T
     72     EXPECT_FALSE(IsBreak("U+1161 | U+11A8"));  // V x T
     73 
     74     // Rule GB8, ( LVT | T ) x T
     75     EXPECT_FALSE(IsBreak("U+AC01 | U+11A8"));  // LVT x T
     76     EXPECT_FALSE(IsBreak("U+11A8 | U+11A8"));  // T x T
     77 
     78     // Other hangul pairs not counted above _are_ breaks (GB10)
     79     EXPECT_TRUE(IsBreak("U+AC00 | U+1100"));  // LV x L
     80     EXPECT_TRUE(IsBreak("U+AC01 | U+1100"));  // LVT x L
     81     EXPECT_TRUE(IsBreak("U+11A8 | U+1100"));  // T x L
     82     EXPECT_TRUE(IsBreak("U+11A8 | U+AC00"));  // T x LV
     83     EXPECT_TRUE(IsBreak("U+11A8 | U+AC01"));  // T x LVT
     84 
     85     // Rule GB8a, Regional_Indicator x Regional_Indicator
     86     EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8"));
     87     EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
     88     EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
     89     EXPECT_FALSE(IsBreak("U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag)
     90 
     91     EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA"));  // Regional indicator pair (flag)
     92     EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)
     93 
     94     EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA"));  // Regional indicator pair (flag)
     95     EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)
     96 
     97     EXPECT_TRUE(
     98             IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
     99     EXPECT_FALSE(
    100             IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
    101     EXPECT_FALSE(
    102             IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8"));  // Regional indicator pair (flag)
    103 
    104     // Rule GB9, x Extend
    105     EXPECT_FALSE(IsBreak("'a' | U+0301"));  // combining accent
    106     // Rule GB9a, x SpacingMark
    107     EXPECT_FALSE(IsBreak("U+0915 | U+093E"));  // KA, AA (spacing mark)
    108     // Rule GB9b, Prepend x
    109     // see tailoring test for prepend, as current ICU doesn't have any characters in the class
    110 
    111     // Rule GB10, Any ÷ Any
    112     EXPECT_TRUE(IsBreak("'a' | 'b'"));
    113     EXPECT_TRUE(IsBreak("'f' | 'i'"));  // probable ligature
    114     EXPECT_TRUE(IsBreak("U+0644 | U+0627"));  // probable ligature, lam + alef
    115     EXPECT_TRUE(IsBreak("U+4E00 | U+4E00"));  // CJK ideographs
    116     EXPECT_TRUE(IsBreak("'a' | U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
    117     EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | 'a'"));  // Regional indicator pair (flag)
    118 }
    119 
    120 TEST(GraphemeBreak, tailoring) {  // expectations for this implementation's tailored behaviour; each group is explained by its own comment
    121     // control characters that we interpret as "extend"
    122     EXPECT_FALSE(IsBreak("'a' | U+00AD"));  // soft hyphen
    123     EXPECT_FALSE(IsBreak("'a' | U+200B"));  // zwsp
    124     EXPECT_FALSE(IsBreak("'a' | U+200E"));  // lrm
    125     EXPECT_FALSE(IsBreak("'a' | U+202A"));  // lre
    126     EXPECT_FALSE(IsBreak("'a' | U+E0041"));  // tag character
    127 
    128     // UTC-approved characters for the Prepend class
    129     EXPECT_FALSE(IsBreak("U+06DD | U+0661"));  // arabic subtending mark + digit one
    130 
    131     EXPECT_TRUE(IsBreak("U+0E01 | U+0E33"));  // Thai sara am
    132 
    133     // virama is not a grapheme break, but "pure killer" is
    134     EXPECT_FALSE(IsBreak("U+0915 | U+094D U+0915"));  // Devanagari ka+virama+ka
    135     EXPECT_FALSE(IsBreak("U+0915 U+094D | U+0915"));  // Devanagari ka+virama+ka
    136     EXPECT_FALSE(IsBreak("U+0E01 | U+0E3A U+0E01"));  // thai phinthu = pure killer
    137     EXPECT_TRUE(IsBreak("U+0E01 U+0E3A | U+0E01"));  // thai phinthu = pure killer
    138 
    139     // suppress grapheme breaks in zwj emoji sequences, see
    140     // http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html
    141     EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468"));
    142     EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468"));
    143     EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468"));
    144     EXPECT_FALSE(IsBreak("U+1F468 U+200D | U+1F469 U+200D U+1F466"));
    145     EXPECT_FALSE(IsBreak("U+1F468 U+200D U+1F469 U+200D | U+1F466"));
    146     EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F469 U+200D U+1F467 U+200D U+1F466"));
    147     EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D | U+1F467 U+200D U+1F466"));
    148     EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466"));
    149     EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8"));
    150 
    151     // Do not break before or after zwj with any kind of emoji character.
    152     EXPECT_FALSE(IsBreak("U+1F431 | U+200D U+1F464"));
    153     EXPECT_FALSE(IsBreak("U+1F431 U+200D | U+1F464"));
    154 
    155     // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break
    156     EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764"));
    157 }
    158 
    159 TEST(GraphemeBreak, emojiModifiers) {  // a modifier (U+1F3FB..U+1F3FF) joins a preceding emoji base; other pairings still break
    160     EXPECT_FALSE(IsBreak("U+261D | U+1F3FB"));  // white up pointing index + modifier
    161     EXPECT_FALSE(IsBreak("U+270C | U+1F3FB"));  // victory hand + modifier
    162     EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB"));  // boy + modifier
    163     EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC"));  // boy + modifier
    164     EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD"));  // boy + modifier
    165     EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE"));  // boy + modifier
    166     EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF"));  // boy + modifier
    167     EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF"));  // sign of the horns + modifier
    168     EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF"));  // selfie (Unicode 9) + modifier
    169 
    170     // text style selector (U+FE0E) after the base brings the break back; emoji style (U+FE0F) keeps the cluster
    171     EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB"));  // victory hand + text style + modifier
    172     EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB"));  // victory hand + emoji style + modifier
    173 
    174     // heart is not an emoji base
    175     EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB"));  // heart + modifier
    176     EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB"));  // heart + text style + modifier
    177     EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB"));  // heart + emoji style + modifier
    178     EXPECT_TRUE(IsBreak("U+1F3FB | U+1F3FB"));  // modifier + modifier
    179 
    180     // rat is not an emoji modifier
    181     EXPECT_TRUE(IsBreak("U+1F466 | U+1F400"));  // boy + rat
    182 }
    183 
    184 TEST(GraphemeBreak, offsets) {  // calls isGraphemeBreak directly with (2, 3) as 2nd/3rd arguments instead of going through IsBreak
    185     uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 };  // 'A', U+06DD, 'E', U+0301, 'I', U+0301
    186     EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 2));  // presumably U+06DD at index 1 lies before the queried range and is ignored - confirm
    187     EXPECT_FALSE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 3));  // 'E' | U+0301: combining accent stays attached
    188     EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 4));  // U+0301 | 'I'
    189     EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 5));  // presumably 5 == 2 + 3 is the end of the range, so U+0301 at index 5 is not considered - confirm
    190 }
    191