1 /* 2 * Copyright 2001-2004 The Apache Software Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.commons.codec.language; 18 19 import org.apache.commons.codec.EncoderException; 20 import org.apache.commons.codec.StringEncoder; 21 22 /** 23 * Encodes a string into a double metaphone value. 24 * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>. 25 * <ul> 26 * <li>Original Article: <a 27 * href="http://www.cuj.com/documents/s=8038/cuj0006philips/"> 28 * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li> 29 * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip"> 30 * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li> 31 * </ul> 32 * 33 * @author Apache Software Foundation 34 * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $ 35 * 36 * @deprecated Please use {@link java.net.URL#openConnection} instead. 37 * Please visit <a href="http://android-developers.blogspot.com/2011/09/androids-http-clients.html">this webpage</a> 38 * for further details. 39 */ 40 @Deprecated 41 public class DoubleMetaphone implements StringEncoder { 42 43 /** 44 * "Vowels" to test for 45 */ 46 private static final String VOWELS = "AEIOUY"; 47 48 /** 49 * Prefixes when present which are not pronounced 50 */ 51 private static final String[] SILENT_START = 52 { "GN", "KN", "PN", "WR", "PS" }; 53 private static final String[] L_R_N_M_B_H_F_V_W_SPACE = 54 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; 55 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = 56 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; 57 private static final String[] L_T_K_S_N_M_B_Z = 58 { "L", "T", "K", "S", "N", "M", "B", "Z" }; 59 60 /** 61 * Maximum length of an encoding, default is 4 62 */ 63 protected int maxCodeLen = 4; 64 65 /** 66 * Creates an instance of this DoubleMetaphone encoder 67 */ 68 public DoubleMetaphone() { 69 super(); 70 } 71 72 /** 73 * Encode a value with Double Metaphone 74 * 75 * @param value String to encode 76 * @return an encoded string 77 */ 78 public String doubleMetaphone(String value) { 79 return doubleMetaphone(value, false); 80 } 81 82 /** 83 * Encode a value with Double Metaphone, optionally using the alternate 84 * encoding. 85 * 86 * @param value String to encode 87 * @param alternate use alternate encode 88 * @return an encoded string 89 */ 90 public String doubleMetaphone(String value, boolean alternate) { 91 value = cleanInput(value); 92 if (value == null) { 93 return null; 94 } 95 96 boolean slavoGermanic = isSlavoGermanic(value); 97 int index = isSilentStart(value) ? 1 : 0; 98 99 DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); 100 101 while (!result.isComplete() && index <= value.length() - 1) { 102 switch (value.charAt(index)) { 103 case 'A': 104 case 'E': 105 case 'I': 106 case 'O': 107 case 'U': 108 case 'Y': 109 index = handleAEIOUY(value, result, index); 110 break; 111 case 'B': 112 result.append('P'); 113 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; 114 break; 115 case '\u00C7': 116 // A C with a Cedilla 117 result.append('S'); 118 index++; 119 break; 120 case 'C': 121 index = handleC(value, result, index); 122 break; 123 case 'D': 124 index = handleD(value, result, index); 125 break; 126 case 'F': 127 result.append('F'); 128 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; 129 break; 130 case 'G': 131 index = handleG(value, result, index, slavoGermanic); 132 break; 133 case 'H': 134 index = handleH(value, result, index); 135 break; 136 case 'J': 137 index = handleJ(value, result, index, slavoGermanic); 138 break; 139 case 'K': 140 result.append('K'); 141 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; 142 break; 143 case 'L': 144 index = handleL(value, result, index); 145 break; 146 case 'M': 147 result.append('M'); 148 index = conditionM0(value, index) ? index + 2 : index + 1; 149 break; 150 case 'N': 151 result.append('N'); 152 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; 153 break; 154 case '\u00D1': 155 // N with a tilde (spanish ene) 156 result.append('N'); 157 index++; 158 break; 159 case 'P': 160 index = handleP(value, result, index); 161 break; 162 case 'Q': 163 result.append('K'); 164 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; 165 break; 166 case 'R': 167 index = handleR(value, result, index, slavoGermanic); 168 break; 169 case 'S': 170 index = handleS(value, result, index, slavoGermanic); 171 break; 172 case 'T': 173 index = handleT(value, result, index); 174 break; 175 case 'V': 176 result.append('F'); 177 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; 178 break; 179 case 'W': 180 index = handleW(value, result, index); 181 break; 182 case 'X': 183 index = handleX(value, result, index); 184 break; 185 case 'Z': 186 index = handleZ(value, result, index, slavoGermanic); 187 break; 188 default: 189 index++; 190 break; 191 } 192 } 193 194 return alternate ? result.getAlternate() : result.getPrimary(); 195 } 196 197 /** 198 * Encode the value using DoubleMetaphone. It will only work if 199 * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>). 200 * 201 * @param obj Object to encode (should be of type String) 202 * @return An encoded Object (will be of type String) 203 * @throws EncoderException encode parameter is not of type String 204 */ 205 public Object encode(Object obj) throws EncoderException { 206 if (!(obj instanceof String)) { 207 throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); 208 } 209 return doubleMetaphone((String) obj); 210 } 211 212 /** 213 * Encode the value using DoubleMetaphone. 214 * 215 * @param value String to encode 216 * @return An encoded String 217 */ 218 public String encode(String value) { 219 return doubleMetaphone(value); 220 } 221 222 /** 223 * Check if the Double Metaphone values of two <code>String</code> values 224 * are equal. 225 * 226 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 227 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 228 * @return <code>true</code> if the encoded <code>String</code>s are equal; 229 * <code>false</code> otherwise. 230 * @see #isDoubleMetaphoneEqual(String,String,boolean) 231 */ 232 public boolean isDoubleMetaphoneEqual(String value1, String value2) { 233 return isDoubleMetaphoneEqual(value1, value2, false); 234 } 235 236 /** 237 * Check if the Double Metaphone values of two <code>String</code> values 238 * are equal, optionally using the alternate value. 239 * 240 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 241 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 242 * @param alternate use the alternate value if <code>true</code>. 243 * @return <code>true</code> if the encoded <code>String</code>s are equal; 244 * <code>false</code> otherwise. 245 */ 246 public boolean isDoubleMetaphoneEqual(String value1, 247 String value2, 248 boolean alternate) { 249 return doubleMetaphone(value1, alternate).equals(doubleMetaphone 250 (value2, alternate)); 251 } 252 253 /** 254 * Returns the maxCodeLen. 255 * @return int 256 */ 257 public int getMaxCodeLen() { 258 return this.maxCodeLen; 259 } 260 261 /** 262 * Sets the maxCodeLen. 263 * @param maxCodeLen The maxCodeLen to set 264 */ 265 public void setMaxCodeLen(int maxCodeLen) { 266 this.maxCodeLen = maxCodeLen; 267 } 268 269 //-- BEGIN HANDLERS --// 270 271 /** 272 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases 273 */ 274 private int handleAEIOUY(String value, DoubleMetaphoneResult result, int 275 index) { 276 if (index == 0) { 277 result.append('A'); 278 } 279 return index + 1; 280 } 281 282 /** 283 * Handles 'C' cases 284 */ 285 private int handleC(String value, 286 DoubleMetaphoneResult result, 287 int index) { 288 if (conditionC0(value, index)) { // very confusing, moved out 289 result.append('K'); 290 index += 2; 291 } else if (index == 0 && contains(value, index, 6, "CAESAR")) { 292 result.append('S'); 293 index += 2; 294 } else if (contains(value, index, 2, "CH")) { 295 index = handleCH(value, result, index); 296 } else if (contains(value, index, 2, "CZ") && 297 !contains(value, index - 2, 4, "WICZ")) { 298 //-- "Czerny" --// 299 result.append('S', 'X'); 300 index += 2; 301 } else if (contains(value, index + 1, 3, "CIA")) { 302 //-- "focaccia" --// 303 result.append('X'); 304 index += 3; 305 } else if (contains(value, index, 2, "CC") && 306 !(index == 1 && charAt(value, 0) == 'M')) { 307 //-- double "cc" but not "McClelland" --// 308 return handleCC(value, result, index); 309 } else if (contains(value, index, 2, "CK", "CG", "CQ")) { 310 result.append('K'); 311 index += 2; 312 } else if (contains(value, index, 2, "CI", "CE", "CY")) { 313 //-- Italian vs. English --// 314 if (contains(value, index, 3, "CIO", "CIE", "CIA")) { 315 result.append('S', 'X'); 316 } else { 317 result.append('S'); 318 } 319 index += 2; 320 } else { 321 result.append('K'); 322 if (contains(value, index + 1, 2, " C", " Q", " G")) { 323 //-- Mac Caffrey, Mac Gregor --// 324 index += 3; 325 } else if (contains(value, index + 1, 1, "C", "K", "Q") && 326 !contains(value, index + 1, 2, "CE", "CI")) { 327 index += 2; 328 } else { 329 index++; 330 } 331 } 332 333 return index; 334 } 335 336 /** 337 * Handles 'CC' cases 338 */ 339 private int handleCC(String value, 340 DoubleMetaphoneResult result, 341 int index) { 342 if (contains(value, index + 2, 1, "I", "E", "H") && 343 !contains(value, index + 2, 2, "HU")) { 344 //-- "bellocchio" but not "bacchus" --// 345 if ((index == 1 && charAt(value, index - 1) == 'A') || 346 contains(value, index - 1, 5, "UCCEE", "UCCES")) { 347 //-- "accident", "accede", "succeed" --// 348 result.append("KS"); 349 } else { 350 //-- "bacci", "bertucci", other Italian --// 351 result.append('X'); 352 } 353 index += 3; 354 } else { // Pierce's rule 355 result.append('K'); 356 index += 2; 357 } 358 359 return index; 360 } 361 362 /** 363 * Handles 'CH' cases 364 */ 365 private int handleCH(String value, 366 DoubleMetaphoneResult result, 367 int index) { 368 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael 369 result.append('K', 'X'); 370 return index + 2; 371 } else if (conditionCH0(value, index)) { 372 //-- Greek roots ("chemistry", "chorus", etc.) --// 373 result.append('K'); 374 return index + 2; 375 } else if (conditionCH1(value, index)) { 376 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// 377 result.append('K'); 378 return index + 2; 379 } else { 380 if (index > 0) { 381 if (contains(value, 0, 2, "MC")) { 382 result.append('K'); 383 } else { 384 result.append('X', 'K'); 385 } 386 } else { 387 result.append('X'); 388 } 389 return index + 2; 390 } 391 } 392 393 /** 394 * Handles 'D' cases 395 */ 396 private int handleD(String value, 397 DoubleMetaphoneResult result, 398 int index) { 399 if (contains(value, index, 2, "DG")) { 400 //-- "Edge" --// 401 if (contains(value, index + 2, 1, "I", "E", "Y")) { 402 result.append('J'); 403 index += 3; 404 //-- "Edgar" --// 405 } else { 406 result.append("TK"); 407 index += 2; 408 } 409 } else if (contains(value, index, 2, "DT", "DD")) { 410 result.append('T'); 411 index += 2; 412 } else { 413 result.append('T'); 414 index++; 415 } 416 return index; 417 } 418 419 /** 420 * Handles 'G' cases 421 */ 422 private int handleG(String value, 423 DoubleMetaphoneResult result, 424 int index, 425 boolean slavoGermanic) { 426 if (charAt(value, index + 1) == 'H') { 427 index = handleGH(value, result, index); 428 } else if (charAt(value, index + 1) == 'N') { 429 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { 430 result.append("KN", "N"); 431 } else if (!contains(value, index + 2, 2, "EY") && 432 charAt(value, index + 1) != 'Y' && !slavoGermanic) { 433 result.append("N", "KN"); 434 } else { 435 result.append("KN"); 436 } 437 index = index + 2; 438 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { 439 result.append("KL", "L"); 440 index += 2; 441 } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { 442 //-- -ges-, -gep-, -gel-, -gie- at beginning --// 443 result.append('K', 'J'); 444 index += 2; 445 } else if ((contains(value, index + 1, 2, "ER") || 446 charAt(value, index + 1) == 'Y') && 447 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && 448 !contains(value, index - 1, 1, "E", "I") && 449 !contains(value, index - 1, 3, "RGY", "OGY")) { 450 //-- -ger-, -gy- --// 451 result.append('K', 'J'); 452 index += 2; 453 } else if (contains(value, index + 1, 1, "E", "I", "Y") || 454 contains(value, index - 1, 4, "AGGI", "OGGI")) { 455 //-- Italian "biaggi" --// 456 if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) { 457 //-- obvious germanic --// 458 result.append('K'); 459 } else if (contains(value, index + 1, 4, "IER")) { 460 result.append('J'); 461 } else { 462 result.append('J', 'K'); 463 } 464 index += 2; 465 } else if (charAt(value, index + 1) == 'G') { 466 index += 2; 467 result.append('K'); 468 } else { 469 index++; 470 result.append('K'); 471 } 472 return index; 473 } 474 475 /** 476 * Handles 'GH' cases 477 */ 478 private int handleGH(String value, 479 DoubleMetaphoneResult result, 480 int index) { 481 if (index > 0 && !isVowel(charAt(value, index - 1))) { 482 result.append('K'); 483 index += 2; 484 } else if (index == 0) { 485 if (charAt(value, index + 2) == 'I') { 486 result.append('J'); 487 } else { 488 result.append('K'); 489 } 490 index += 2; 491 } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || 492 (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || 493 (index > 3 && contains(value, index - 4, 1, "B", "H"))) { 494 //-- Parker's rule (with some further refinements) - "hugh" 495 index += 2; 496 } else { 497 if (index > 2 && charAt(value, index - 1) == 'U' && 498 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { 499 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" 500 result.append('F'); 501 } else if (index > 0 && charAt(value, index - 1) != 'I') { 502 result.append('K'); 503 } 504 index += 2; 505 } 506 return index; 507 } 508 509 /** 510 * Handles 'H' cases 511 */ 512 private int handleH(String value, 513 DoubleMetaphoneResult result, 514 int index) { 515 //-- only keep if first & before vowel or between 2 vowels --// 516 if ((index == 0 || isVowel(charAt(value, index - 1))) && 517 isVowel(charAt(value, index + 1))) { 518 result.append('H'); 519 index += 2; 520 //-- also takes car of "HH" --// 521 } else { 522 index++; 523 } 524 return index; 525 } 526 527 /** 528 * Handles 'J' cases 529 */ 530 private int handleJ(String value, DoubleMetaphoneResult result, int index, 531 boolean slavoGermanic) { 532 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { 533 //-- obvious Spanish, "Jose", "San Jacinto" --// 534 if ((index == 0 && (charAt(value, index + 4) == ' ') || 535 value.length() == 4) || contains(value, 0, 4, "SAN ")) { 536 result.append('H'); 537 } else { 538 result.append('J', 'H'); 539 } 540 index++; 541 } else { 542 if (index == 0 && !contains(value, index, 4, "JOSE")) { 543 result.append('J', 'A'); 544 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && 545 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { 546 result.append('J', 'H'); 547 } else if (index == value.length() - 1) { 548 result.append('J', ' '); 549 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) { 550 result.append('J'); 551 } 552 553 if (charAt(value, index + 1) == 'J') { 554 index += 2; 555 } else { 556 index++; 557 } 558 } 559 return index; 560 } 561 562 /** 563 * Handles 'L' cases 564 */ 565 private int handleL(String value, 566 DoubleMetaphoneResult result, 567 int index) { 568 result.append('L'); 569 if (charAt(value, index + 1) == 'L') { 570 if (conditionL0(value, index)) { 571 result.appendAlternate(' '); 572 } 573 index += 2; 574 } else { 575 index++; 576 } 577 return index; 578 } 579 580 /** 581 * Handles 'P' cases 582 */ 583 private int handleP(String value, 584 DoubleMetaphoneResult result, 585 int index) { 586 if (charAt(value, index + 1) == 'H') { 587 result.append('F'); 588 index += 2; 589 } else { 590 result.append('P'); 591 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; 592 } 593 return index; 594 } 595 596 /** 597 * Handles 'R' cases 598 */ 599 private int handleR(String value, 600 DoubleMetaphoneResult result, 601 int index, 602 boolean slavoGermanic) { 603 if (index == value.length() - 1 && !slavoGermanic && 604 contains(value, index - 2, 2, "IE") && 605 !contains(value, index - 4, 2, "ME", "MA")) { 606 result.appendAlternate('R'); 607 } else { 608 result.append('R'); 609 } 610 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; 611 } 612 613 /** 614 * Handles 'S' cases 615 */ 616 private int handleS(String value, 617 DoubleMetaphoneResult result, 618 int index, 619 boolean slavoGermanic) { 620 if (contains(value, index - 1, 3, "ISL", "YSL")) { 621 //-- special cases "island", "isle", "carlisle", "carlysle" --// 622 index++; 623 } else if (index == 0 && contains(value, index, 5, "SUGAR")) { 624 //-- special case "sugar-" --// 625 result.append('X', 'S'); 626 index++; 627 } else if (contains(value, index, 2, "SH")) { 628 if (contains(value, index + 1, 4, 629 "HEIM", "HOEK", "HOLM", "HOLZ")) { 630 //-- germanic --// 631 result.append('S'); 632 } else { 633 result.append('X'); 634 } 635 index += 2; 636 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { 637 //-- Italian and Armenian --// 638 if (slavoGermanic) { 639 result.append('S'); 640 } else { 641 result.append('S', 'X'); 642 } 643 index += 3; 644 } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) { 645 //-- german & anglicisations, e.g. "smith" match "schmidt" // 646 // "snider" match "schneider" --// 647 //-- also, -sz- in slavic language altho in hungarian it // 648 // is pronounced "s" --// 649 result.append('S', 'X'); 650 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; 651 } else if (contains(value, index, 2, "SC")) { 652 index = handleSC(value, result, index); 653 } else { 654 if (index == value.length() - 1 && contains(value, index - 2, 655 2, "AI", "OI")){ 656 //-- french e.g. "resnais", "artois" --// 657 result.appendAlternate('S'); 658 } else { 659 result.append('S'); 660 } 661 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; 662 } 663 return index; 664 } 665 666 /** 667 * Handles 'SC' cases 668 */ 669 private int handleSC(String value, 670 DoubleMetaphoneResult result, 671 int index) { 672 if (charAt(value, index + 2) == 'H') { 673 //-- Schlesinger's rule --// 674 if (contains(value, index + 3, 675 2, "OO", "ER", "EN", "UY", "ED", "EM")) { 676 //-- Dutch origin, e.g. "school", "schooner" --// 677 if (contains(value, index + 3, 2, "ER", "EN")) { 678 //-- "schermerhorn", "schenker" --// 679 result.append("X", "SK"); 680 } else { 681 result.append("SK"); 682 } 683 } else { 684 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { 685 result.append('X', 'S'); 686 } else { 687 result.append('X'); 688 } 689 } 690 } else if (contains(value, index + 2, 1, "I", "E", "Y")) { 691 result.append('S'); 692 } else { 693 result.append("SK"); 694 } 695 return index + 3; 696 } 697 698 /** 699 * Handles 'T' cases 700 */ 701 private int handleT(String value, 702 DoubleMetaphoneResult result, 703 int index) { 704 if (contains(value, index, 4, "TION")) { 705 result.append('X'); 706 index += 3; 707 } else if (contains(value, index, 3, "TIA", "TCH")) { 708 result.append('X'); 709 index += 3; 710 } else if (contains(value, index, 2, "TH") || contains(value, index, 711 3, "TTH")) { 712 if (contains(value, index + 2, 2, "OM", "AM") || 713 //-- special case "thomas", "thames" or germanic --// 714 contains(value, 0, 4, "VAN ", "VON ") || 715 contains(value, 0, 3, "SCH")) { 716 result.append('T'); 717 } else { 718 result.append('0', 'T'); 719 } 720 index += 2; 721 } else { 722 result.append('T'); 723 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; 724 } 725 return index; 726 } 727 728 /** 729 * Handles 'W' cases 730 */ 731 private int handleW(String value, 732 DoubleMetaphoneResult result, 733 int index) { 734 if (contains(value, index, 2, "WR")) { 735 //-- can also be in middle of word --// 736 result.append('R'); 737 index += 2; 738 } else { 739 if (index == 0 && (isVowel(charAt(value, index + 1)) || 740 contains(value, index, 2, "WH"))) { 741 if (isVowel(charAt(value, index + 1))) { 742 //-- Wasserman should match Vasserman --// 743 result.append('A', 'F'); 744 } else { 745 //-- need Uomo to match Womo --// 746 result.append('A'); 747 } 748 index++; 749 } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || 750 contains(value, index - 1, 751 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 752 contains(value, 0, 3, "SCH")) { 753 //-- Arnow should match Arnoff --// 754 result.appendAlternate('F'); 755 index++; 756 } else if (contains(value, index, 4, "WICZ", "WITZ")) { 757 //-- Polish e.g. "filipowicz" --// 758 result.append("TS", "FX"); 759 index += 4; 760 } else { 761 index++; 762 } 763 } 764 return index; 765 } 766 767 /** 768 * Handles 'X' cases 769 */ 770 private int handleX(String value, 771 DoubleMetaphoneResult result, 772 int index) { 773 if (index == 0) { 774 result.append('S'); 775 index++; 776 } else { 777 if (!((index == value.length() - 1) && 778 (contains(value, index - 3, 3, "IAU", "EAU") || 779 contains(value, index - 2, 2, "AU", "OU")))) { 780 //-- French e.g. breaux --// 781 result.append("KS"); 782 } 783 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; 784 } 785 return index; 786 } 787 788 /** 789 * Handles 'Z' cases 790 */ 791 private int handleZ(String value, DoubleMetaphoneResult result, int index, 792 boolean slavoGermanic) { 793 if (charAt(value, index + 1) == 'H') { 794 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// 795 result.append('J'); 796 index += 2; 797 } else { 798 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { 799 result.append("S", "TS"); 800 } else { 801 result.append('S'); 802 } 803 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; 804 } 805 return index; 806 } 807 808 //-- BEGIN CONDITIONS --// 809 810 /** 811 * Complex condition 0 for 'C' 812 */ 813 private boolean conditionC0(String value, int index) { 814 if (contains(value, index, 4, "CHIA")) { 815 return true; 816 } else if (index <= 1) { 817 return false; 818 } else if (isVowel(charAt(value, index - 2))) { 819 return false; 820 } else if (!contains(value, index - 1, 3, "ACH")) { 821 return false; 822 } else { 823 char c = charAt(value, index + 2); 824 return (c != 'I' && c != 'E') 825 || contains(value, index - 2, 6, "BACHER", "MACHER"); 826 } 827 } 828 829 /** 830 * Complex condition 0 for 'CH' 831 */ 832 private boolean conditionCH0(String value, int index) { 833 if (index != 0) { 834 return false; 835 } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") && 836 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { 837 return false; 838 } else if (contains(value, 0, 5, "CHORE")) { 839 return false; 840 } else { 841 return true; 842 } 843 } 844 845 /** 846 * Complex condition 1 for 'CH' 847 */ 848 private boolean conditionCH1(String value, int index) { 849 return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 850 3, "SCH")) || 851 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || 852 contains(value, index + 2, 1, "T", "S") || 853 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && 854 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); 855 } 856 857 /** 858 * Complex condition 0 for 'L' 859 */ 860 private boolean conditionL0(String value, int index) { 861 if (index == value.length() - 3 && 862 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { 863 return true; 864 } else if ((contains(value, index - 1, 2, "AS", "OS") || 865 contains(value, value.length() - 1, 1, "A", "O")) && 866 contains(value, index - 1, 4, "ALLE")) { 867 return true; 868 } else { 869 return false; 870 } 871 } 872 873 /** 874 * Complex condition 0 for 'M' 875 */ 876 private boolean conditionM0(String value, int index) { 877 if (charAt(value, index + 1) == 'M') { 878 return true; 879 } 880 return contains(value, index - 1, 3, "UMB") 881 && ((index + 1) == value.length() - 1 || contains(value, 882 index + 2, 2, "ER")); 883 } 884 885 //-- BEGIN HELPER FUNCTIONS --// 886 887 /** 888 * Determines whether or not a value is of slavo-germanic orgin. A value is 889 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. 890 */ 891 private boolean isSlavoGermanic(String value) { 892 return value.indexOf('W') > -1 || value.indexOf('K') > -1 || 893 value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1; 894 } 895 896 /** 897 * Determines whether or not a character is a vowel or not 898 */ 899 private boolean isVowel(char ch) { 900 return VOWELS.indexOf(ch) != -1; 901 } 902 903 /** 904 * Determines whether or not the value starts with a silent letter. It will 905 * return <code>true</code> if the value starts with any of 'GN', 'KN', 906 * 'PN', 'WR' or 'PS'. 907 */ 908 private boolean isSilentStart(String value) { 909 boolean result = false; 910 for (int i = 0; i < SILENT_START.length; i++) { 911 if (value.startsWith(SILENT_START[i])) { 912 result = true; 913 break; 914 } 915 } 916 return result; 917 } 918 919 /** 920 * Cleans the input 921 */ 922 private String cleanInput(String input) { 923 if (input == null) { 924 return null; 925 } 926 input = input.trim(); 927 if (input.length() == 0) { 928 return null; 929 } 930 return input.toUpperCase(); 931 } 932 933 /** 934 * Gets the character at index <code>index</code> if available, otherwise 935 * it returns <code>Character.MIN_VALUE</code> so that there is some sort 936 * of a default 937 */ 938 protected char charAt(String value, int index) { 939 if (index < 0 || index >= value.length()) { 940 return Character.MIN_VALUE; 941 } 942 return value.charAt(index); 943 } 944 945 /** 946 * Shortcut method with 1 criteria 947 */ 948 private static boolean contains(String value, int start, int length, 949 String criteria) { 950 return contains(value, start, length, 951 new String[] { criteria }); 952 } 953 954 /** 955 * Shortcut method with 2 criteria 956 */ 957 private static boolean contains(String value, int start, int length, 958 String criteria1, String criteria2) { 959 return contains(value, start, length, 960 new String[] { criteria1, criteria2 }); 961 } 962 963 /** 964 * Shortcut method with 3 criteria 965 */ 966 private static boolean contains(String value, int start, int length, 967 String criteria1, String criteria2, 968 String criteria3) { 969 return contains(value, start, length, 970 new String[] { criteria1, criteria2, criteria3 }); 971 } 972 973 /** 974 * Shortcut method with 4 criteria 975 */ 976 private static boolean contains(String value, int start, int length, 977 String criteria1, String criteria2, 978 String criteria3, String criteria4) { 979 return contains(value, start, length, 980 new String[] { criteria1, criteria2, criteria3, 981 criteria4 }); 982 } 983 984 /** 985 * Shortcut method with 5 criteria 986 */ 987 private static boolean contains(String value, int start, int length, 988 String criteria1, String criteria2, 989 String criteria3, String criteria4, 990 String criteria5) { 991 return contains(value, start, length, 992 new String[] { criteria1, criteria2, criteria3, 993 criteria4, criteria5 }); 994 } 995 996 /** 997 * Shortcut method with 6 criteria 998 */ 999 private static boolean contains(String value, int start, int length, 1000 String criteria1, String criteria2, 1001 String criteria3, String criteria4, 1002 String criteria5, String criteria6) { 1003 return contains(value, start, length, 1004 new String[] { criteria1, criteria2, criteria3, 1005 criteria4, criteria5, criteria6 }); 1006 } 1007 1008 /** 1009 * Determines whether <code>value</code> contains any of the criteria 1010 starting 1011 * at index <code>start</code> and matching up to length <code>length</code> 1012 */ 1013 protected static boolean contains(String value, int start, int length, 1014 String[] criteria) { 1015 boolean result = false; 1016 if (start >= 0 && start + length <= value.length()) { 1017 String target = value.substring(start, start + length); 1018 1019 for (int i = 0; i < criteria.length; i++) { 1020 if (target.equals(criteria[i])) { 1021 result = true; 1022 break; 1023 } 1024 } 1025 } 1026 return result; 1027 } 1028 1029 //-- BEGIN INNER CLASSES --// 1030 1031 /** 1032 * Inner class for storing results, since there is the optional alternate 1033 * encoding. 1034 */ 1035 public class DoubleMetaphoneResult { 1036 1037 private StringBuffer primary = new StringBuffer(getMaxCodeLen()); 1038 private StringBuffer alternate = new StringBuffer(getMaxCodeLen()); 1039 private int maxLength; 1040 1041 public DoubleMetaphoneResult(int maxLength) { 1042 this.maxLength = maxLength; 1043 } 1044 1045 public void append(char value) { 1046 appendPrimary(value); 1047 appendAlternate(value); 1048 } 1049 1050 public void append(char primary, char alternate) { 1051 appendPrimary(primary); 1052 appendAlternate(alternate); 1053 } 1054 1055 public void appendPrimary(char value) { 1056 if (this.primary.length() < this.maxLength) { 1057 this.primary.append(value); 1058 } 1059 } 1060 1061 public void appendAlternate(char value) { 1062 if (this.alternate.length() < this.maxLength) { 1063 this.alternate.append(value); 1064 } 1065 } 1066 1067 public void append(String value) { 1068 appendPrimary(value); 1069 appendAlternate(value); 1070 } 1071 1072 public void append(String primary, String alternate) { 1073 appendPrimary(primary); 1074 appendAlternate(alternate); 1075 } 1076 1077 public void appendPrimary(String value) { 1078 int addChars = this.maxLength - this.primary.length(); 1079 if (value.length() <= addChars) { 1080 this.primary.append(value); 1081 } else { 1082 this.primary.append(value.substring(0, addChars)); 1083 } 1084 } 1085 1086 public void appendAlternate(String value) { 1087 int addChars = this.maxLength - this.alternate.length(); 1088 if (value.length() <= addChars) { 1089 this.alternate.append(value); 1090 } else { 1091 this.alternate.append(value.substring(0, addChars)); 1092 } 1093 } 1094 1095 public String getPrimary() { 1096 return this.primary.toString(); 1097 } 1098 1099 public String getAlternate() { 1100 return this.alternate.toString(); 1101 } 1102 1103 public boolean isComplete() { 1104 return this.primary.length() >= this.maxLength && 1105 this.alternate.length() >= this.maxLength; 1106 } 1107 } 1108 } 1109