/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "minikin/Hyphenator.h"

#include <gtest/gtest.h>

#include "FileUtils.h"

// Element count of a statically sized array; only defined if nothing else provided it already.
#ifndef NELEM
#define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
#endif

namespace minikin {

// Paths of the binary hyphenation pattern files that the automatic-hyphenation tests read.
const char* usHyph = "/system/usr/hyphen-data/hyph-en-us.hyb";
const char* malayalamHyph = "/system/usr/hyphen-data/hyph-ml.hyb";

// UTF-16 code units used to build the test words, listed in ascending code point order.
const uint16_t HYPHEN_MINUS = 0x002D;
const uint16_t SOFT_HYPHEN = 0x00AD;
const uint16_t MIDDLE_DOT = 0x00B7;
const uint16_t GREEK_LOWER_ALPHA = 0x03B1;
const uint16_t ARMENIAN_AYB = 0x0531;
const uint16_t HEBREW_ALEF = 0x05D0;
const uint16_t ARABIC_ALEF = 0x0627;
const uint16_t ARABIC_BEH = 0x0628;
const uint16_t ARABIC_ZWARAKAY = 0x0659;
const uint16_t MALAYALAM_KA = 0x0D15;
const uint16_t UCAS_E = 0x1401;
const uint16_t HYPHEN = 0x2010;
const uint16_t EN_DASH = 0x2013;

// Simple test for US English. This tests "table", which happens to be in the exceptions list.
47 TEST(HyphenatorTest, usEnglishAutomaticHyphenation) { 48 std::vector<uint8_t> patternData = readWholeFile(usHyph); 49 Hyphenator* hyphenator = Hyphenator::loadBinary(patternData.data(), 2, 3, "en"); 50 const uint16_t word[] = {'t', 'a', 'b', 'l', 'e'}; 51 std::vector<HyphenationType> result; 52 hyphenator->hyphenate(word, &result); 53 EXPECT_EQ((size_t)5, result.size()); 54 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 55 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 56 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 57 EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]); 58 EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]); 59 } 60 61 // Catalan ll should break as l-/l 62 TEST(HyphenatorTest, catalanMiddleDot) { 63 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "ca"); 64 const uint16_t word[] = {'l', 'l', MIDDLE_DOT, 'l', 'l'}; 65 std::vector<HyphenationType> result; 66 hyphenator->hyphenate(word, &result); 67 EXPECT_EQ((size_t)5, result.size()); 68 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 69 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 70 EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]); 71 EXPECT_EQ(HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN, result[3]); 72 EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]); 73 } 74 75 // Catalan ll should not break if the word is too short. 76 TEST(HyphenatorTest, catalanMiddleDotShortWord) { 77 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "ca"); 78 const uint16_t word[] = {'l', MIDDLE_DOT, 'l'}; 79 std::vector<HyphenationType> result; 80 hyphenator->hyphenate(word, &result); 81 EXPECT_EQ((size_t)3, result.size()); 82 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 83 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 84 EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]); 85 } 86 87 // If we break on a hyphen in Polish, the hyphen should be repeated on the next line. 
88 TEST(HyphenatorTest, polishHyphen) { 89 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl"); 90 const uint16_t word[] = {'x', HYPHEN, 'y'}; 91 std::vector<HyphenationType> result; 92 hyphenator->hyphenate(word, &result); 93 EXPECT_EQ((size_t)3, result.size()); 94 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 95 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 96 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]); 97 } 98 99 // If the language is Polish but the script is not Latin, don't use Polish rules for hyphenation. 100 TEST(HyphenatorTest, polishHyphenButNonLatinWord) { 101 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl"); 102 const uint16_t word[] = {GREEK_LOWER_ALPHA, HYPHEN, GREEK_LOWER_ALPHA}; 103 std::vector<HyphenationType> result; 104 hyphenator->hyphenate(word, &result); 105 EXPECT_EQ((size_t)3, result.size()); 106 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 107 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 108 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 109 } 110 111 // Polish en dash doesn't repeat on next line (as far as we know), but just provides a break 112 // opportunity. 113 TEST(HyphenatorTest, polishEnDash) { 114 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "pl"); 115 const uint16_t word[] = {'x', EN_DASH, 'y'}; 116 std::vector<HyphenationType> result; 117 hyphenator->hyphenate(word, &result); 118 EXPECT_EQ((size_t)3, result.size()); 119 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 120 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 121 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 122 } 123 124 // If we break on a hyphen in Slovenian, the hyphen should be repeated on the next line. (Same as 125 // Polish.) 
126 TEST(HyphenatorTest, slovenianHyphen) { 127 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "sl"); 128 const uint16_t word[] = {'x', HYPHEN, 'y'}; 129 std::vector<HyphenationType> result; 130 hyphenator->hyphenate(word, &result); 131 EXPECT_EQ((size_t)3, result.size()); 132 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 133 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 134 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE, result[2]); 135 } 136 137 // In Latin script text, soft hyphens should insert a visible hyphen if broken at. 138 TEST(HyphenatorTest, latinSoftHyphen) { 139 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 140 const uint16_t word[] = {'x', SOFT_HYPHEN, 'y'}; 141 std::vector<HyphenationType> result; 142 hyphenator->hyphenate(word, &result); 143 EXPECT_EQ((size_t)3, result.size()); 144 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 145 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 146 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 147 } 148 149 // Soft hyphens at the beginning of a word are not useful in linebreaking. 150 TEST(HyphenatorTest, latinSoftHyphenStartingTheWord) { 151 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 152 const uint16_t word[] = {SOFT_HYPHEN, 'y'}; 153 std::vector<HyphenationType> result; 154 hyphenator->hyphenate(word, &result); 155 EXPECT_EQ((size_t)2, result.size()); 156 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 157 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 158 } 159 160 // In Malayalam script text, soft hyphens should not insert a visible hyphen if broken at. 
161 TEST(HyphenatorTest, malayalamSoftHyphen) { 162 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 163 const uint16_t word[] = {MALAYALAM_KA, SOFT_HYPHEN, MALAYALAM_KA}; 164 std::vector<HyphenationType> result; 165 hyphenator->hyphenate(word, &result); 166 EXPECT_EQ((size_t)3, result.size()); 167 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 168 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 169 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 170 } 171 172 // In automatically hyphenated Malayalam script text, we should not insert a visible hyphen. 173 TEST(HyphenatorTest, malayalamAutomaticHyphenation) { 174 std::vector<uint8_t> patternData = readWholeFile(malayalamHyph); 175 Hyphenator* hyphenator = Hyphenator::loadBinary(patternData.data(), 2, 2, "en"); 176 const uint16_t word[] = {MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA, MALAYALAM_KA}; 177 std::vector<HyphenationType> result; 178 hyphenator->hyphenate(word, &result); 179 EXPECT_EQ((size_t)5, result.size()); 180 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 181 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 182 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 183 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[3]); 184 EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]); 185 } 186 187 // In Armenian script text, soft hyphens should insert an Armenian hyphen if broken at. 
188 TEST(HyphenatorTest, aremenianSoftHyphen) { 189 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 190 const uint16_t word[] = {ARMENIAN_AYB, SOFT_HYPHEN, ARMENIAN_AYB}; 191 std::vector<HyphenationType> result; 192 hyphenator->hyphenate(word, &result); 193 EXPECT_EQ((size_t)3, result.size()); 194 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 195 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 196 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN, result[2]); 197 } 198 199 // In Hebrew script text, soft hyphens should insert a normal hyphen if broken at, for now. 200 // We may need to change this to maqaf later. 201 TEST(HyphenatorTest, hebrewSoftHyphen) { 202 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 203 const uint16_t word[] = {HEBREW_ALEF, SOFT_HYPHEN, HEBREW_ALEF}; 204 std::vector<HyphenationType> result; 205 hyphenator->hyphenate(word, &result); 206 EXPECT_EQ((size_t)3, result.size()); 207 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 208 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 209 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 210 } 211 212 // Soft hyphen between two Arabic letters that join should keep the joining 213 // behavior when broken across lines. 214 TEST(HyphenatorTest, arabicSoftHyphenConnecting) { 215 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 216 const uint16_t word[] = {ARABIC_BEH, SOFT_HYPHEN, ARABIC_BEH}; 217 std::vector<HyphenationType> result; 218 hyphenator->hyphenate(word, &result); 219 EXPECT_EQ((size_t)3, result.size()); 220 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 221 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 222 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[2]); 223 } 224 225 // Arabic letters may be joining on one side, but if it's the wrong side, we 226 // should use the normal hyphen. 
227 TEST(HyphenatorTest, arabicSoftHyphenNonConnecting) { 228 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 229 const uint16_t word[] = {ARABIC_ALEF, SOFT_HYPHEN, ARABIC_BEH}; 230 std::vector<HyphenationType> result; 231 hyphenator->hyphenate(word, &result); 232 EXPECT_EQ((size_t)3, result.size()); 233 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 234 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 235 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 236 } 237 238 // Skip transparent characters until you find a non-transparent one. 239 TEST(HyphenatorTest, arabicSoftHyphenSkipTransparents) { 240 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 241 const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH}; 242 std::vector<HyphenationType> result; 243 hyphenator->hyphenate(word, &result); 244 EXPECT_EQ((size_t)5, result.size()); 245 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 246 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 247 EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]); 248 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ, result[3]); 249 EXPECT_EQ(HyphenationType::DONT_BREAK, result[4]); 250 } 251 252 // Skip transparent characters until you find a non-transparent one. If we get to one end without 253 // finding anything, we are still non-joining. 
254 TEST(HyphenatorTest, arabicSoftHyphenTransparentsAtEnd) { 255 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 256 const uint16_t word[] = {ARABIC_BEH, ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY}; 257 std::vector<HyphenationType> result; 258 hyphenator->hyphenate(word, &result); 259 EXPECT_EQ((size_t)4, result.size()); 260 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 261 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 262 EXPECT_EQ(HyphenationType::DONT_BREAK, result[2]); 263 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[3]); 264 } 265 266 // Skip transparent characters until you find a non-transparent one. If we get to one end without 267 // finding anything, we are still non-joining. 268 TEST(HyphenatorTest, arabicSoftHyphenTransparentsAtStart) { 269 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 270 const uint16_t word[] = {ARABIC_ZWARAKAY, SOFT_HYPHEN, ARABIC_ZWARAKAY, ARABIC_BEH}; 271 std::vector<HyphenationType> result; 272 hyphenator->hyphenate(word, &result); 273 EXPECT_EQ((size_t)4, result.size()); 274 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 275 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 276 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_HYPHEN, result[2]); 277 EXPECT_EQ(HyphenationType::DONT_BREAK, result[3]); 278 } 279 280 // In Unified Canadian Aboriginal script (UCAS) text, soft hyphens should insert a UCAS hyphen. 
281 TEST(HyphenatorTest, ucasSoftHyphen) { 282 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 283 const uint16_t word[] = {UCAS_E, SOFT_HYPHEN, UCAS_E}; 284 std::vector<HyphenationType> result; 285 hyphenator->hyphenate(word, &result); 286 EXPECT_EQ((size_t)3, result.size()); 287 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 288 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 289 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]); 290 } 291 292 // Presently, soft hyphen looks at the character after it to determine hyphenation type. This is a 293 // little arbitrary, but let's test it anyway. 294 TEST(HyphenatorTest, mixedScriptSoftHyphen) { 295 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 296 const uint16_t word[] = {'a', SOFT_HYPHEN, UCAS_E}; 297 std::vector<HyphenationType> result; 298 hyphenator->hyphenate(word, &result); 299 EXPECT_EQ((size_t)3, result.size()); 300 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 301 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 302 EXPECT_EQ(HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN, result[2]); 303 } 304 305 // Hard hyphens provide a breaking opportunity with nothing extra inserted. 306 TEST(HyphenatorTest, hardHyphen) { 307 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 308 const uint16_t word[] = {'x', HYPHEN, 'y'}; 309 std::vector<HyphenationType> result; 310 hyphenator->hyphenate(word, &result); 311 EXPECT_EQ((size_t)3, result.size()); 312 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 313 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 314 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 315 } 316 317 // Hyphen-minuses also provide a breaking opportunity with nothing extra inserted. 
318 TEST(HyphenatorTest, hyphenMinus) { 319 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 320 const uint16_t word[] = {'x', HYPHEN_MINUS, 'y'}; 321 std::vector<HyphenationType> result; 322 hyphenator->hyphenate(word, &result); 323 EXPECT_EQ((size_t)3, result.size()); 324 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 325 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 326 EXPECT_EQ(HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN, result[2]); 327 } 328 329 // If the word starts with a hard hyphen or hyphen-minus, it doesn't make sense to break 330 // it at that point. 331 TEST(HyphenatorTest, startingHyphenMinus) { 332 Hyphenator* hyphenator = Hyphenator::loadBinary(nullptr, 2, 2, "en"); 333 const uint16_t word[] = {HYPHEN_MINUS, 'y'}; 334 std::vector<HyphenationType> result; 335 hyphenator->hyphenate(word, &result); 336 EXPECT_EQ((size_t)2, result.size()); 337 EXPECT_EQ(HyphenationType::DONT_BREAK, result[0]); 338 EXPECT_EQ(HyphenationType::DONT_BREAK, result[1]); 339 } 340 341 } // namespace minikin 342