Home | History | Annotate | Download | only in unittest
      1 /*
      2  * Copyright (C) 2017 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #include "minikin/Hyphenator.h"
     18 
     19 #include <gtest/gtest.h>
     20 
     21 #include "FileUtils.h"
     22 
     23 #ifndef NELEM
     24 #define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
     25 #endif
     26 
     27 namespace minikin {
     28 
     29 const char* usHyph = "/system/usr/hyphen-data/hyph-en-us.hyb";
     30 const char* malayalamHyph = "/system/usr/hyphen-data/hyph-ml.hyb";
     31 
     32 const uint16_t HYPHEN_MINUS = 0x002D;
     33 const uint16_t SOFT_HYPHEN = 0x00AD;
     34 const uint16_t MIDDLE_DOT = 0x00B7;
     35 const uint16_t GREEK_LOWER_ALPHA = 0x03B1;
     36 const uint16_t ARMENIAN_AYB = 0x0531;
     37 const uint16_t HEBREW_ALEF = 0x05D0;
     38 const uint16_t ARABIC_ALEF = 0x0627;
     39 const uint16_t ARABIC_BEH = 0x0628;
     40 const uint16_t ARABIC_ZWARAKAY = 0x0659;
     41 const uint16_t MALAYALAM_KA = 0x0D15;
     42 const uint16_t UCAS_E = 0x1401;
     43 const uint16_t HYPHEN = 0x2010;
     44 const uint16_t EN_DASH = 0x2013;
     45 
     46 // Simple test for US English. This tests "table", which happens to be the in the exceptions list.
     47 TEST(HyphenatorTest, usEnglishAutomaticHyphenation) {
     48     std::vector<uint8_t> patternData = readWholeFile(usHyph);
     49     Hyphenator* hyphenator = Hyphenator::loadBinary(patternData.data(), 2, 3, "en");
     50     const uint16_t word[] = {'t', 'a', 'b', 'l', 'e'};
     51     std::vector<HyphenationType> result;
     52     hyphenator->hyphenate(word, &result);
     53     EXPECT_EQ((size_t)5, result.size());
     54     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
     55     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
     56     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
     57     EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
     58     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
     59 }
     60 
     61 // Catalan ll should break as l-/l
     62 TEST(HyphenatorTest, catalanMiddleDot) {
     63     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "ca");
     64     const uint16_t word[] = {'l', 'l', MIDDLE_DOT, 'l', 'l'};
     65     std::vector<HyphenationType> result;
     66     hyphenator->hyphenate(word, &result);
     67     EXPECT_EQ((size_t)5, result.size());
     68     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
     69     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
     70     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
     71     EXPECT_EQ(HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN, result[3]);
     72     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
     73 }
     74 
     75 // Catalan ll should not break if the word is too short.
     76 TEST(HyphenatorTest, catalanMiddleDotShortWord) {
     77     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "ca");
     78     const uint16_t word[] = {'l', MIDDLE_DOT, 'l'};
     79     std::vector<HyphenationType> result;
     80     hyphenator->hyphenate(word, &result);
     81     EXPECT_EQ((size_t)3, result.size());
     82     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
     83     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
     84     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
     85 }
     86 
     87 // If we break on a hyphen in Polish, the hyphen should be repeated on the next line.
     88 TEST(HyphenatorTest, polishHyphen) {
     89     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
     90     const uint16_t word[] = {'x', HYPHEN, 'y'};
     91     std::vector<HyphenationType> result;
     92     hyphenator->hyphenate(word, &result);
     93     EXPECT_EQ((size_t)3, result.size());
     94     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
     95     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
     96     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]);
     97 }
     98 
     99 // If the language is Polish but the script is not Latin, don't use Polish rules for hyphenation.
    100 TEST(HyphenatorTest, polishHyphenButNonLatinWord) {
    101     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
    102     const uint16_t word[] = {GREEK_LOWER_ALPHA, HYPHEN, GREEK_LOWER_ALPHA};
    103     std::vector<HyphenationType> result;
    104     hyphenator->hyphenate(word, &result);
    105     EXPECT_EQ((size_t)3, result.size());
    106     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    107     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    108     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
    109 }
    110 
    111 // Polish en dash doesn't repeat on next line (as far as we know), but just provides a break
    112 // opportunity.
    113 TEST(HyphenatorTest, polishEnDash) {
    114     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl");
    115     const uint16_t word[] = {'x', EN_DASH, 'y'};
    116     std::vector<HyphenationType> result;
    117     hyphenator->hyphenate(word, &result);
    118     EXPECT_EQ((size_t)3, result.size());
    119     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    120     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    121     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
    122 }
    123 
    124 // If we break on a hyphen in Slovenian, the hyphen should be repeated on the next line. (Same as
    125 // Polish.)
    126 TEST(HyphenatorTest, slovenianHyphen) {
    127     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "sl");
    128     const uint16_t word[] = {'x', HYPHEN, 'y'};
    129     std::vector<HyphenationType> result;
    130     hyphenator->hyphenate(word, &result);
    131     EXPECT_EQ((size_t)3, result.size());
    132     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    133     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    134     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]);
    135 }
    136 
    137 // In Latin script text, soft hyphens should insert a visible hyphen if broken at.
    138 TEST(HyphenatorTest, latinSoftHyphen) {
    139     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    140     const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'};
    141     std::vector<HyphenationType> result;
    142     hyphenator->hyphenate(word, &result);
    143     EXPECT_EQ((size_t)3, result.size());
    144     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    145     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    146     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
    147 }
    148 
    149 // Soft hyphens at the beginning of a word are not useful in linebreaking.
    150 TEST(HyphenatorTest, latinSoftHyphenStartingTheWord) {
    151     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    152     const uint16_t word[] = {SOFT_HYPHEN, 'y'};
    153     std::vector<HyphenationType> result;
    154     hyphenator->hyphenate(word, &result);
    155     EXPECT_EQ((size_t)2, result.size());
    156     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    157     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    158 }
    159 
    160 // In Malayalam script text, soft hyphens should not insert a visible hyphen if broken at.
    161 TEST(HyphenatorTest, malayalamSoftHyphen) {
    162     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    163     const uint16_t word[] = {MALAYALAM_KA, SOFT_HYPHEN, MALAYALAM_KA};
    164     std::vector<HyphenationType> result;
    165     hyphenator->hyphenate(word, &result);
    166     EXPECT_EQ((size_t)3, result.size());
    167     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    168     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    169     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
    170 }
    171 
    172 // In automatically hyphenated Malayalam script text, we should not insert a visible hyphen.
    173 TEST(HyphenatorTest, malayalamAutomaticHyphenation) {
    174     std::vector<uint8_t> patternData = readWholeFile(malayalamHyph);
    175     Hyphenator* hyphenator = Hyphenator::loadBinary(patternData.data(), 2, 2, "en");
    176     const uint16_t word[] = {MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA};
    177     std::vector<HyphenationType> result;
    178     hyphenator->hyphenate(word, &result);
    179     EXPECT_EQ((size_t)5, result.size());
    180     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    181     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    182     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
    183     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[3]);
    184     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
    185 }
    186 
    187 // In Armenian script text, soft hyphens should insert an Armenian hyphen if broken at.
    188 TEST(HyphenatorTest, aremenianSoftHyphen) {
    189     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    190     const uint16_t word[] = {ARMENIAN_AYB, SOFT_HYPHEN, ARMENIAN_AYB};
    191     std::vector<HyphenationType> result;
    192     hyphenator->hyphenate(word, &result);
    193     EXPECT_EQ((size_t)3, result.size());
    194     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    195     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    196     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN, result[2]);
    197 }
    198 
    199 // In Hebrew script text, soft hyphens should insert a normal hyphen if broken at, for now.
    200 // We may need to change this to maqaf later.
    201 TEST(HyphenatorTest, hebrewSoftHyphen) {
    202     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    203     const uint16_t word[] = {HEBREW_ALEF, SOFT_HYPHEN, HEBREW_ALEF};
    204     std::vector<HyphenationType> result;
    205     hyphenator->hyphenate(word, &result);
    206     EXPECT_EQ((size_t)3, result.size());
    207     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    208     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    209     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
    210 }
    211 
    212 // Soft hyphen between two Arabic letters that join should keep the joining
    213 // behavior when broken across lines.
    214 TEST(HyphenatorTest, arabicSoftHyphenConnecting) {
    215     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    216     const uint16_t word[] = {ARABIC_BEH, SOFT_HYPHEN, ARABIC_BEH};
    217     std::vector<HyphenationType> result;
    218     hyphenator->hyphenate(word, &result);
    219     EXPECT_EQ((size_t)3, result.size());
    220     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    221     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    222     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[2]);
    223 }
    224 
    225 // Arabic letters may be joining on one side, but if it's the wrong side, we
    226 // should use the normal hyphen.
    227 TEST(HyphenatorTest, arabicSoftHyphenNonConnecting) {
    228     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    229     const uint16_t word[] = {ARABIC_ALEF, SOFT_HYPHEN, ARABIC_BEH};
    230     std::vector<HyphenationType> result;
    231     hyphenator->hyphenate(word, &result);
    232     EXPECT_EQ((size_t)3, result.size());
    233     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    234     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    235     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
    236 }
    237 
    238 // Skip transparent characters until you find a non-transparent one.
    239 TEST(HyphenatorTest, arabicSoftHyphenSkipTransparents) {
    240     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    241     const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
    242     std::vector<HyphenationType> result;
    243     hyphenator->hyphenate(word, &result);
    244     EXPECT_EQ((size_t)5, result.size());
    245     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    246     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    247     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
    248     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[3]);
    249     EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]);
    250 }
    251 
    252 // Skip transparent characters until you find a non-transparent one. If we get to one end without
    253 // finding anything, we are still non-joining.
    254 TEST(HyphenatorTest, arabicSoftHyphenTransparentsAtEnd) {
    255     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    256     const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY};
    257     std::vector<HyphenationType> result;
    258     hyphenator->hyphenate(word, &result);
    259     EXPECT_EQ((size_t)4, result.size());
    260     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    261     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    262     EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]);
    263     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[3]);
    264 }
    265 
    266 // Skip transparent characters until you find a non-transparent one. If we get to one end without
    267 // finding anything, we are still non-joining.
    268 TEST(HyphenatorTest, arabicSoftHyphenTransparentsAtStart) {
    269     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    270     const uint16_t word[] = {ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH};
    271     std::vector<HyphenationType> result;
    272     hyphenator->hyphenate(word, &result);
    273     EXPECT_EQ((size_t)4, result.size());
    274     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    275     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    276     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]);
    277     EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]);
    278 }
    279 
    280 // In Unified Canadian Aboriginal script (UCAS) text, soft hyphens should insert a UCAS hyphen.
    281 TEST(HyphenatorTest, ucasSoftHyphen) {
    282     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    283     const uint16_t word[] = {UCAS_E, SOFT_HYPHEN, UCAS_E};
    284     std::vector<HyphenationType> result;
    285     hyphenator->hyphenate(word, &result);
    286     EXPECT_EQ((size_t)3, result.size());
    287     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    288     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    289     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
    290 }
    291 
    292 // Presently, soft hyphen looks at the character after it to determine hyphenation type. This is a
    293 // little arbitrary, but let's test it anyway.
    294 TEST(HyphenatorTest, mixedScriptSoftHyphen) {
    295     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    296     const uint16_t word[] = {'a', SOFT_HYPHEN, UCAS_E};
    297     std::vector<HyphenationType> result;
    298     hyphenator->hyphenate(word, &result);
    299     EXPECT_EQ((size_t)3, result.size());
    300     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    301     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    302     EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]);
    303 }
    304 
    305 // Hard hyphens provide a breaking opportunity with nothing extra inserted.
    306 TEST(HyphenatorTest, hardHyphen) {
    307     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    308     const uint16_t word[] = {'x', HYPHEN, 'y'};
    309     std::vector<HyphenationType> result;
    310     hyphenator->hyphenate(word, &result);
    311     EXPECT_EQ((size_t)3, result.size());
    312     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    313     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    314     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
    315 }
    316 
    317 // Hyphen-minuses also provide a breaking opportunity with nothing extra inserted.
    318 TEST(HyphenatorTest, hyphenMinus) {
    319     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    320     const uint16_t word[] = {'x', HYPHEN_MINUS, 'y'};
    321     std::vector<HyphenationType> result;
    322     hyphenator->hyphenate(word, &result);
    323     EXPECT_EQ((size_t)3, result.size());
    324     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    325     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    326     EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]);
    327 }
    328 
    329 // If the word starts with a hard hyphen or hyphen-minus, it doesn't make sense to break
    330 // it at that point.
    331 TEST(HyphenatorTest, startingHyphenMinus) {
    332     Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en");
    333     const uint16_t word[] = {HYPHEN_MINUS, 'y'};
    334     std::vector<HyphenationType> result;
    335     hyphenator->hyphenate(word, &result);
    336     EXPECT_EQ((size_t)2, result.size());
    337     EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]);
    338     EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]);
    339 }
    340 
    341 }  // namespace minikin
    342