Home | History | Annotate | Download | only in language
      1 /*
      2  * Copyright 2001-2004 The Apache Software Foundation.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package org.apache.commons.codec.language;
     18 
     19 import org.apache.commons.codec.EncoderException;
     20 import org.apache.commons.codec.StringEncoder;
     21 
     22 /**
     23  * Encodes a string into a double metaphone value.
     24  * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
     25  * <ul>
     26  * <li>Original Article: <a
     27  * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
     28  * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
     29  * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
     30  * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
     31  * </ul>
     32  *
     33  * @author Apache Software Foundation
     34  * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
     35  */
     36 public class DoubleMetaphone implements StringEncoder {
     37 
    /**
     * "Vowels" to test for
     */
    private static final String VOWELS = "AEIOUY";

    /**
     * Prefixes when present which are not pronounced
     */
    private static final String[] SILENT_START =
    { "GN", "KN", "PN", "WR", "PS" };
    /**
     * Lookup table of single-character strings (including a space).
     * NOTE(review): consumer not confirmed - presumably one of the
     * condition* helpers; verify before changing.
     */
    private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
    { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
    /**
     * Two-letter sequences that handleG tests for directly after a 'G'
     * at the very start of the value.
     */
    private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
    { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
    /**
     * Single letters that handleJ tests for directly after a 'J'.
     */
    private static final String[] L_T_K_S_N_M_B_Z =
    { "L", "T", "K", "S", "N", "M", "B", "Z" };

    /**
     * Maximum length of an encoding, default is 4.
     * Read through getMaxCodeLen() each time doubleMetaphone creates a
     * new result holder.
     */
    protected int maxCodeLen = 4;
     59 
     60     /**
     61      * Creates an instance of this DoubleMetaphone encoder
     62      */
     63     public DoubleMetaphone() {
     64         super();
     65     }
     66 
     67     /**
     68      * Encode a value with Double Metaphone
     69      *
     70      * @param value String to encode
     71      * @return an encoded string
     72      */
     73     public String doubleMetaphone(String value) {
     74         return doubleMetaphone(value, false);
     75     }
     76 
     77     /**
     78      * Encode a value with Double Metaphone, optionally using the alternate
     79      * encoding.
     80      *
     81      * @param value String to encode
     82      * @param alternate use alternate encode
     83      * @return an encoded string
     84      */
     85     public String doubleMetaphone(String value, boolean alternate) {
     86         value = cleanInput(value);
     87         if (value == null) {
     88             return null;
     89         }
     90 
     91         boolean slavoGermanic = isSlavoGermanic(value);
     92         int index = isSilentStart(value) ? 1 : 0;
     93 
     94         DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
     95 
     96         while (!result.isComplete() && index <= value.length() - 1) {
     97             switch (value.charAt(index)) {
     98             case 'A':
     99             case 'E':
    100             case 'I':
    101             case 'O':
    102             case 'U':
    103             case 'Y':
    104                 index = handleAEIOUY(value, result, index);
    105                 break;
    106             case 'B':
    107                 result.append('P');
    108                 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
    109                 break;
    110             case '\u00C7':
    111                 // A C with a Cedilla
    112                 result.append('S');
    113                 index++;
    114                 break;
    115             case 'C':
    116                 index = handleC(value, result, index);
    117                 break;
    118             case 'D':
    119                 index = handleD(value, result, index);
    120                 break;
    121             case 'F':
    122                 result.append('F');
    123                 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
    124                 break;
    125             case 'G':
    126                 index = handleG(value, result, index, slavoGermanic);
    127                 break;
    128             case 'H':
    129                 index = handleH(value, result, index);
    130                 break;
    131             case 'J':
    132                 index = handleJ(value, result, index, slavoGermanic);
    133                 break;
    134             case 'K':
    135                 result.append('K');
    136                 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
    137                 break;
    138             case 'L':
    139                 index = handleL(value, result, index);
    140                 break;
    141             case 'M':
    142                 result.append('M');
    143                 index = conditionM0(value, index) ? index + 2 : index + 1;
    144                 break;
    145             case 'N':
    146                 result.append('N');
    147                 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
    148                 break;
    149             case '\u00D1':
    150                 // N with a tilde (spanish ene)
    151                 result.append('N');
    152                 index++;
    153                 break;
    154             case 'P':
    155                 index = handleP(value, result, index);
    156                 break;
    157             case 'Q':
    158                 result.append('K');
    159                 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
    160                 break;
    161             case 'R':
    162                 index = handleR(value, result, index, slavoGermanic);
    163                 break;
    164             case 'S':
    165                 index = handleS(value, result, index, slavoGermanic);
    166                 break;
    167             case 'T':
    168                 index = handleT(value, result, index);
    169                 break;
    170             case 'V':
    171                 result.append('F');
    172                 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
    173                 break;
    174             case 'W':
    175                 index = handleW(value, result, index);
    176                 break;
    177             case 'X':
    178                 index = handleX(value, result, index);
    179                 break;
    180             case 'Z':
    181                 index = handleZ(value, result, index, slavoGermanic);
    182                 break;
    183             default:
    184                 index++;
    185                 break;
    186             }
    187         }
    188 
    189         return alternate ? result.getAlternate() : result.getPrimary();
    190     }
    191 
    192     /**
    193      * Encode the value using DoubleMetaphone.  It will only work if
    194      * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
    195      *
    196      * @param obj Object to encode (should be of type String)
    197      * @return An encoded Object (will be of type String)
    198      * @throws EncoderException encode parameter is not of type String
    199      */
    200     public Object encode(Object obj) throws EncoderException {
    201         if (!(obj instanceof String)) {
    202             throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
    203         }
    204         return doubleMetaphone((String) obj);
    205     }
    206 
    207     /**
    208      * Encode the value using DoubleMetaphone.
    209      *
    210      * @param value String to encode
    211      * @return An encoded String
    212      */
    213     public String encode(String value) {
    214         return doubleMetaphone(value);
    215     }
    216 
    217     /**
    218      * Check if the Double Metaphone values of two <code>String</code> values
    219      * are equal.
    220      *
    221      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
    222      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
    223      * @return <code>true</code> if the encoded <code>String</code>s are equal;
    224      *          <code>false</code> otherwise.
    225      * @see #isDoubleMetaphoneEqual(String,String,boolean)
    226      */
    227     public boolean isDoubleMetaphoneEqual(String value1, String value2) {
    228         return isDoubleMetaphoneEqual(value1, value2, false);
    229     }
    230 
    231     /**
    232      * Check if the Double Metaphone values of two <code>String</code> values
    233      * are equal, optionally using the alternate value.
    234      *
    235      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
    236      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
    237      * @param alternate use the alternate value if <code>true</code>.
    238      * @return <code>true</code> if the encoded <code>String</code>s are equal;
    239      *          <code>false</code> otherwise.
    240      */
    241     public boolean isDoubleMetaphoneEqual(String value1,
    242                                           String value2,
    243                                           boolean alternate) {
    244         return doubleMetaphone(value1, alternate).equals(doubleMetaphone
    245                                                          (value2, alternate));
    246     }
    247 
    248     /**
    249      * Returns the maxCodeLen.
    250      * @return int
    251      */
    252     public int getMaxCodeLen() {
    253         return this.maxCodeLen;
    254     }
    255 
    /**
     * Sets the maximum length of an encoding produced by this encoder.
     * The value is stored as given; no range check is performed here.
     *
     * @param maxCodeLen The maxCodeLen to set
     */
    public void setMaxCodeLen(int maxCodeLen) {
        this.maxCodeLen = maxCodeLen;
    }
    263 
    264     //-- BEGIN HANDLERS --//
    265 
    266     /**
    267      * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
    268      */
    269     private int handleAEIOUY(String value, DoubleMetaphoneResult result, int
    270                              index) {
    271         if (index == 0) {
    272             result.append('A');
    273         }
    274         return index + 1;
    275     }
    276 
    277     /**
    278      * Handles 'C' cases
    279      */
    280     private int handleC(String value,
    281                         DoubleMetaphoneResult result,
    282                         int index) {
    283         if (conditionC0(value, index)) {  // very confusing, moved out
    284             result.append('K');
    285             index += 2;
    286         } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
    287             result.append('S');
    288             index += 2;
    289         } else if (contains(value, index, 2, "CH")) {
    290             index = handleCH(value, result, index);
    291         } else if (contains(value, index, 2, "CZ") &&
    292                    !contains(value, index - 2, 4, "WICZ")) {
    293             //-- "Czerny" --//
    294             result.append('S', 'X');
    295             index += 2;
    296         } else if (contains(value, index + 1, 3, "CIA")) {
    297             //-- "focaccia" --//
    298             result.append('X');
    299             index += 3;
    300         } else if (contains(value, index, 2, "CC") &&
    301                    !(index == 1 && charAt(value, 0) == 'M')) {
    302             //-- double "cc" but not "McClelland" --//
    303             return handleCC(value, result, index);
    304         } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
    305             result.append('K');
    306             index += 2;
    307         } else if (contains(value, index, 2, "CI", "CE", "CY")) {
    308             //-- Italian vs. English --//
    309             if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
    310                 result.append('S', 'X');
    311             } else {
    312                 result.append('S');
    313             }
    314             index += 2;
    315         } else {
    316             result.append('K');
    317             if (contains(value, index + 1, 2, " C", " Q", " G")) {
    318                 //-- Mac Caffrey, Mac Gregor --//
    319                 index += 3;
    320             } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
    321                        !contains(value, index + 1, 2, "CE", "CI")) {
    322                 index += 2;
    323             } else {
    324                 index++;
    325             }
    326         }
    327 
    328         return index;
    329     }
    330 
    331     /**
    332      * Handles 'CC' cases
    333      */
    334     private int handleCC(String value,
    335                          DoubleMetaphoneResult result,
    336                          int index) {
    337         if (contains(value, index + 2, 1, "I", "E", "H") &&
    338             !contains(value, index + 2, 2, "HU")) {
    339             //-- "bellocchio" but not "bacchus" --//
    340             if ((index == 1 && charAt(value, index - 1) == 'A') ||
    341                 contains(value, index - 1, 5, "UCCEE", "UCCES")) {
    342                 //-- "accident", "accede", "succeed" --//
    343                 result.append("KS");
    344             } else {
    345                 //-- "bacci", "bertucci", other Italian --//
    346                 result.append('X');
    347             }
    348             index += 3;
    349         } else {    // Pierce's rule
    350             result.append('K');
    351             index += 2;
    352         }
    353 
    354         return index;
    355     }
    356 
    357     /**
    358      * Handles 'CH' cases
    359      */
    360     private int handleCH(String value,
    361                          DoubleMetaphoneResult result,
    362                          int index) {
    363         if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
    364             result.append('K', 'X');
    365             return index + 2;
    366         } else if (conditionCH0(value, index)) {
    367             //-- Greek roots ("chemistry", "chorus", etc.) --//
    368             result.append('K');
    369             return index + 2;
    370         } else if (conditionCH1(value, index)) {
    371             //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
    372             result.append('K');
    373             return index + 2;
    374         } else {
    375             if (index > 0) {
    376                 if (contains(value, 0, 2, "MC")) {
    377                     result.append('K');
    378                 } else {
    379                     result.append('X', 'K');
    380                 }
    381             } else {
    382                 result.append('X');
    383             }
    384             return index + 2;
    385         }
    386     }
    387 
    388     /**
    389      * Handles 'D' cases
    390      */
    391     private int handleD(String value,
    392                         DoubleMetaphoneResult result,
    393                         int index) {
    394         if (contains(value, index, 2, "DG")) {
    395             //-- "Edge" --//
    396             if (contains(value, index + 2, 1, "I", "E", "Y")) {
    397                 result.append('J');
    398                 index += 3;
    399                 //-- "Edgar" --//
    400             } else {
    401                 result.append("TK");
    402                 index += 2;
    403             }
    404         } else if (contains(value, index, 2, "DT", "DD")) {
    405             result.append('T');
    406             index += 2;
    407         } else {
    408             result.append('T');
    409             index++;
    410         }
    411         return index;
    412     }
    413 
    414     /**
    415      * Handles 'G' cases
    416      */
    417     private int handleG(String value,
    418                         DoubleMetaphoneResult result,
    419                         int index,
    420                         boolean slavoGermanic) {
    421         if (charAt(value, index + 1) == 'H') {
    422             index = handleGH(value, result, index);
    423         } else if (charAt(value, index + 1) == 'N') {
    424             if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
    425                 result.append("KN", "N");
    426             } else if (!contains(value, index + 2, 2, "EY") &&
    427                        charAt(value, index + 1) != 'Y' && !slavoGermanic) {
    428                 result.append("N", "KN");
    429             } else {
    430                 result.append("KN");
    431             }
    432             index = index + 2;
    433         } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
    434             result.append("KL", "L");
    435             index += 2;
    436         } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
    437             //-- -ges-, -gep-, -gel-, -gie- at beginning --//
    438             result.append('K', 'J');
    439             index += 2;
    440         } else if ((contains(value, index + 1, 2, "ER") ||
    441                     charAt(value, index + 1) == 'Y') &&
    442                    !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
    443                    !contains(value, index - 1, 1, "E", "I") &&
    444                    !contains(value, index - 1, 3, "RGY", "OGY")) {
    445             //-- -ger-, -gy- --//
    446             result.append('K', 'J');
    447             index += 2;
    448         } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
    449                    contains(value, index - 1, 4, "AGGI", "OGGI")) {
    450             //-- Italian "biaggi" --//
    451             if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) {
    452                 //-- obvious germanic --//
    453                 result.append('K');
    454             } else if (contains(value, index + 1, 4, "IER")) {
    455                 result.append('J');
    456             } else {
    457                 result.append('J', 'K');
    458             }
    459             index += 2;
    460         } else if (charAt(value, index + 1) == 'G') {
    461             index += 2;
    462             result.append('K');
    463         } else {
    464             index++;
    465             result.append('K');
    466         }
    467         return index;
    468     }
    469 
    470     /**
    471      * Handles 'GH' cases
    472      */
    473     private int handleGH(String value,
    474                          DoubleMetaphoneResult result,
    475                          int index) {
    476         if (index > 0 && !isVowel(charAt(value, index - 1))) {
    477             result.append('K');
    478             index += 2;
    479         } else if (index == 0) {
    480             if (charAt(value, index + 2) == 'I') {
    481                 result.append('J');
    482             } else {
    483                 result.append('K');
    484             }
    485             index += 2;
    486         } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
    487                    (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
    488                    (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
    489             //-- Parker's rule (with some further refinements) - "hugh"
    490             index += 2;
    491         } else {
    492             if (index > 2 && charAt(value, index - 1) == 'U' &&
    493                 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
    494                 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
    495                 result.append('F');
    496             } else if (index > 0 && charAt(value, index - 1) != 'I') {
    497                 result.append('K');
    498             }
    499             index += 2;
    500         }
    501         return index;
    502     }
    503 
    504     /**
    505      * Handles 'H' cases
    506      */
    507     private int handleH(String value,
    508                         DoubleMetaphoneResult result,
    509                         int index) {
    510         //-- only keep if first & before vowel or between 2 vowels --//
    511         if ((index == 0 || isVowel(charAt(value, index - 1))) &&
    512             isVowel(charAt(value, index + 1))) {
    513             result.append('H');
    514             index += 2;
    515             //-- also takes car of "HH" --//
    516         } else {
    517             index++;
    518         }
    519         return index;
    520     }
    521 
    522     /**
    523      * Handles 'J' cases
    524      */
    525     private int handleJ(String value, DoubleMetaphoneResult result, int index,
    526                         boolean slavoGermanic) {
    527         if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
    528                 //-- obvious Spanish, "Jose", "San Jacinto" --//
    529                 if ((index == 0 && (charAt(value, index + 4) == ' ') ||
    530                      value.length() == 4) || contains(value, 0, 4, "SAN ")) {
    531                     result.append('H');
    532                 } else {
    533                     result.append('J', 'H');
    534                 }
    535                 index++;
    536             } else {
    537                 if (index == 0 && !contains(value, index, 4, "JOSE")) {
    538                     result.append('J', 'A');
    539                 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
    540                               (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
    541                     result.append('J', 'H');
    542                 } else if (index == value.length() - 1) {
    543                     result.append('J', ' ');
    544                 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) {
    545                     result.append('J');
    546                 }
    547 
    548                 if (charAt(value, index + 1) == 'J') {
    549                     index += 2;
    550                 } else {
    551                     index++;
    552                 }
    553             }
    554         return index;
    555     }
    556 
    557     /**
    558      * Handles 'L' cases
    559      */
    560     private int handleL(String value,
    561                         DoubleMetaphoneResult result,
    562                         int index) {
    563         result.append('L');
    564         if (charAt(value, index + 1) == 'L') {
    565             if (conditionL0(value, index)) {
    566                 result.appendAlternate(' ');
    567             }
    568             index += 2;
    569         } else {
    570             index++;
    571         }
    572         return index;
    573     }
    574 
    575     /**
    576      * Handles 'P' cases
    577      */
    578     private int handleP(String value,
    579                         DoubleMetaphoneResult result,
    580                         int index) {
    581         if (charAt(value, index + 1) == 'H') {
    582             result.append('F');
    583             index += 2;
    584         } else {
    585             result.append('P');
    586             index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
    587         }
    588         return index;
    589     }
    590 
    591     /**
    592      * Handles 'R' cases
    593      */
    594     private int handleR(String value,
    595                         DoubleMetaphoneResult result,
    596                         int index,
    597                         boolean slavoGermanic) {
    598         if (index == value.length() - 1 && !slavoGermanic &&
    599             contains(value, index - 2, 2, "IE") &&
    600             !contains(value, index - 4, 2, "ME", "MA")) {
    601             result.appendAlternate('R');
    602         } else {
    603             result.append('R');
    604         }
    605         return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
    606     }
    607 
    608     /**
    609      * Handles 'S' cases
    610      */
    611     private int handleS(String value,
    612                         DoubleMetaphoneResult result,
    613                         int index,
    614                         boolean slavoGermanic) {
    615         if (contains(value, index - 1, 3, "ISL", "YSL")) {
    616             //-- special cases "island", "isle", "carlisle", "carlysle" --//
    617             index++;
    618         } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
    619             //-- special case "sugar-" --//
    620             result.append('X', 'S');
    621             index++;
    622         } else if (contains(value, index, 2, "SH")) {
    623             if (contains(value, index + 1, 4,
    624                          "HEIM", "HOEK", "HOLM", "HOLZ")) {
    625                 //-- germanic --//
    626                 result.append('S');
    627             } else {
    628                 result.append('X');
    629             }
    630             index += 2;
    631         } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
    632             //-- Italian and Armenian --//
    633             if (slavoGermanic) {
    634                 result.append('S');
    635             } else {
    636                 result.append('S', 'X');
    637             }
    638             index += 3;
    639         } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
    640             //-- german & anglicisations, e.g. "smith" match "schmidt" //
    641             // "snider" match "schneider" --//
    642             //-- also, -sz- in slavic language altho in hungarian it //
    643             //   is pronounced "s" --//
    644             result.append('S', 'X');
    645             index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
    646         } else if (contains(value, index, 2, "SC")) {
    647             index = handleSC(value, result, index);
    648         } else {
    649             if (index == value.length() - 1 && contains(value, index - 2,
    650                                                         2, "AI", "OI")){
    651                 //-- french e.g. "resnais", "artois" --//
    652                 result.appendAlternate('S');
    653             } else {
    654                 result.append('S');
    655             }
    656             index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
    657         }
    658         return index;
    659     }
    660 
    661     /**
    662      * Handles 'SC' cases
    663      */
    664     private int handleSC(String value,
    665                          DoubleMetaphoneResult result,
    666                          int index) {
    667         if (charAt(value, index + 2) == 'H') {
    668             //-- Schlesinger's rule --//
    669             if (contains(value, index + 3,
    670                          2, "OO", "ER", "EN", "UY", "ED", "EM")) {
    671                 //-- Dutch origin, e.g. "school", "schooner" --//
    672                 if (contains(value, index + 3, 2, "ER", "EN")) {
    673                     //-- "schermerhorn", "schenker" --//
    674                     result.append("X", "SK");
    675                 } else {
    676                     result.append("SK");
    677                 }
    678             } else {
    679                 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
    680                     result.append('X', 'S');
    681                 } else {
    682                     result.append('X');
    683                 }
    684             }
    685         } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
    686             result.append('S');
    687         } else {
    688             result.append("SK");
    689         }
    690         return index + 3;
    691     }
    692 
    693     /**
    694      * Handles 'T' cases
    695      */
    696     private int handleT(String value,
    697                         DoubleMetaphoneResult result,
    698                         int index) {
    699         if (contains(value, index, 4, "TION")) {
    700             result.append('X');
    701             index += 3;
    702         } else if (contains(value, index, 3, "TIA", "TCH")) {
    703             result.append('X');
    704             index += 3;
    705         } else if (contains(value, index, 2, "TH") || contains(value, index,
    706                                                                3, "TTH")) {
    707             if (contains(value, index + 2, 2, "OM", "AM") ||
    708                 //-- special case "thomas", "thames" or germanic --//
    709                 contains(value, 0, 4, "VAN ", "VON ") ||
    710                 contains(value, 0, 3, "SCH")) {
    711                 result.append('T');
    712             } else {
    713                 result.append('0', 'T');
    714             }
    715             index += 2;
    716         } else {
    717             result.append('T');
    718             index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
    719         }
    720         return index;
    721     }
    722 
    723     /**
    724      * Handles 'W' cases
    725      */
    726     private int handleW(String value,
    727                         DoubleMetaphoneResult result,
    728                         int index) {
    729         if (contains(value, index, 2, "WR")) {
    730             //-- can also be in middle of word --//
    731             result.append('R');
    732             index += 2;
    733         } else {
    734             if (index == 0 && (isVowel(charAt(value, index + 1)) ||
    735                                contains(value, index, 2, "WH"))) {
    736                 if (isVowel(charAt(value, index + 1))) {
    737                     //-- Wasserman should match Vasserman --//
    738                     result.append('A', 'F');
    739                 } else {
    740                     //-- need Uomo to match Womo --//
    741                     result.append('A');
    742                 }
    743                 index++;
    744             } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
    745                        contains(value, index - 1,
    746                                 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
    747                        contains(value, 0, 3, "SCH")) {
    748                 //-- Arnow should match Arnoff --//
    749                 result.appendAlternate('F');
    750                 index++;
    751             } else if (contains(value, index, 4, "WICZ", "WITZ")) {
    752                 //-- Polish e.g. "filipowicz" --//
    753                 result.append("TS", "FX");
    754                 index += 4;
    755             } else {
    756                 index++;
    757             }
    758         }
    759         return index;
    760     }
    761 
    762     /**
    763      * Handles 'X' cases
    764      */
    765     private int handleX(String value,
    766                         DoubleMetaphoneResult result,
    767                         int index) {
    768         if (index == 0) {
    769             result.append('S');
    770             index++;
    771         } else {
    772             if (!((index == value.length() - 1) &&
    773                   (contains(value, index - 3, 3, "IAU", "EAU") ||
    774                    contains(value, index - 2, 2, "AU", "OU")))) {
    775                 //-- French e.g. breaux --//
    776                 result.append("KS");
    777             }
    778             index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
    779         }
    780         return index;
    781     }
    782 
    783     /**
    784      * Handles 'Z' cases
    785      */
    786     private int handleZ(String value, DoubleMetaphoneResult result, int index,
    787                         boolean slavoGermanic) {
    788         if (charAt(value, index + 1) == 'H') {
    789             //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
    790             result.append('J');
    791             index += 2;
    792         } else {
    793             if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
    794                 result.append("S", "TS");
    795             } else {
    796                 result.append('S');
    797             }
    798             index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
    799         }
    800         return index;
    801     }
    802 
    803     //-- BEGIN CONDITIONS --//
    804 
    805     /**
    806      * Complex condition 0 for 'C'
    807      */
    808     private boolean conditionC0(String value, int index) {
    809         if (contains(value, index, 4, "CHIA")) {
    810             return true;
    811         } else if (index <= 1) {
    812             return false;
    813         } else if (isVowel(charAt(value, index - 2))) {
    814             return false;
    815         } else if (!contains(value, index - 1, 3, "ACH")) {
    816             return false;
    817         } else {
    818             char c = charAt(value, index + 2);
    819             return (c != 'I' && c != 'E')
    820                     || contains(value, index - 2, 6, "BACHER", "MACHER");
    821         }
    822     }
    823 
    824     /**
    825      * Complex condition 0 for 'CH'
    826      */
    827     private boolean conditionCH0(String value, int index) {
    828         if (index != 0) {
    829             return false;
    830         } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
    831                    !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
    832             return false;
    833         } else if (contains(value, 0, 5, "CHORE")) {
    834             return false;
    835         } else {
    836             return true;
    837         }
    838     }
    839 
    840     /**
    841      * Complex condition 1 for 'CH'
    842      */
    843     private boolean conditionCH1(String value, int index) {
    844         return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
    845                                                                    3, "SCH")) ||
    846                 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
    847                 contains(value, index + 2, 1, "T", "S") ||
    848                 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
    849                  (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
    850     }
    851 
    852     /**
    853      * Complex condition 0 for 'L'
    854      */
    855     private boolean conditionL0(String value, int index) {
    856         if (index == value.length() - 3 &&
    857             contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
    858             return true;
    859         } else if ((contains(value, index - 1, 2, "AS", "OS") ||
    860                     contains(value, value.length() - 1, 1, "A", "O")) &&
    861                    contains(value, index - 1, 4, "ALLE")) {
    862             return true;
    863         } else {
    864             return false;
    865         }
    866     }
    867 
    868     /**
    869      * Complex condition 0 for 'M'
    870      */
    871     private boolean conditionM0(String value, int index) {
    872         if (charAt(value, index + 1) == 'M') {
    873             return true;
    874         }
    875         return contains(value, index - 1, 3, "UMB")
    876                 && ((index + 1) == value.length() - 1 || contains(value,
    877                         index + 2, 2, "ER"));
    878     }
    879 
    880     //-- BEGIN HELPER FUNCTIONS --//
    881 
    882     /**
    883      * Determines whether or not a value is of slavo-germanic orgin. A value is
    884      * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
    885      */
    886     private boolean isSlavoGermanic(String value) {
    887         return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
    888             value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
    889     }
    890 
    891     /**
    892      * Determines whether or not a character is a vowel or not
    893      */
    894     private boolean isVowel(char ch) {
    895         return VOWELS.indexOf(ch) != -1;
    896     }
    897 
    898     /**
    899      * Determines whether or not the value starts with a silent letter.  It will
    900      * return <code>true</code> if the value starts with any of 'GN', 'KN',
    901      * 'PN', 'WR' or 'PS'.
    902      */
    903     private boolean isSilentStart(String value) {
    904         boolean result = false;
    905         for (int i = 0; i < SILENT_START.length; i++) {
    906             if (value.startsWith(SILENT_START[i])) {
    907                 result = true;
    908                 break;
    909             }
    910         }
    911         return result;
    912     }
    913 
    914     /**
    915      * Cleans the input
    916      */
    917     private String cleanInput(String input) {
    918         if (input == null) {
    919             return null;
    920         }
    921         input = input.trim();
    922         if (input.length() == 0) {
    923             return null;
    924         }
    925         return input.toUpperCase();
    926     }
    927 
    928     /**
    929      * Gets the character at index <code>index</code> if available, otherwise
    930      * it returns <code>Character.MIN_VALUE</code> so that there is some sort
    931      * of a default
    932      */
    933     protected char charAt(String value, int index) {
    934         if (index < 0 || index >= value.length()) {
    935             return Character.MIN_VALUE;
    936         }
    937         return value.charAt(index);
    938     }
    939 
    940     /**
    941      * Shortcut method with 1 criteria
    942      */
    943     private static boolean contains(String value, int start, int length,
    944                                     String criteria) {
    945         return contains(value, start, length,
    946                         new String[] { criteria });
    947     }
    948 
    949     /**
    950      * Shortcut method with 2 criteria
    951      */
    952     private static boolean contains(String value, int start, int length,
    953                                     String criteria1, String criteria2) {
    954         return contains(value, start, length,
    955                         new String[] { criteria1, criteria2 });
    956     }
    957 
    958     /**
    959      * Shortcut method with 3 criteria
    960      */
    961     private static boolean contains(String value, int start, int length,
    962                                     String criteria1, String criteria2,
    963                                     String criteria3) {
    964         return contains(value, start, length,
    965                         new String[] { criteria1, criteria2, criteria3 });
    966     }
    967 
    968     /**
    969      * Shortcut method with 4 criteria
    970      */
    971     private static boolean contains(String value, int start, int length,
    972                                     String criteria1, String criteria2,
    973                                     String criteria3, String criteria4) {
    974         return contains(value, start, length,
    975                         new String[] { criteria1, criteria2, criteria3,
    976                                        criteria4 });
    977     }
    978 
    979     /**
    980      * Shortcut method with 5 criteria
    981      */
    982     private static boolean contains(String value, int start, int length,
    983                                     String criteria1, String criteria2,
    984                                     String criteria3, String criteria4,
    985                                     String criteria5) {
    986         return contains(value, start, length,
    987                         new String[] { criteria1, criteria2, criteria3,
    988                                        criteria4, criteria5 });
    989     }
    990 
    991     /**
    992      * Shortcut method with 6 criteria
    993      */
    994     private static boolean contains(String value, int start, int length,
    995                                     String criteria1, String criteria2,
    996                                     String criteria3, String criteria4,
    997                                     String criteria5, String criteria6) {
    998         return contains(value, start, length,
    999                         new String[] { criteria1, criteria2, criteria3,
   1000                                        criteria4, criteria5, criteria6 });
   1001     }
   1002 
   1003     /**
   1004      * Determines whether <code>value</code> contains any of the criteria
   1005      starting
   1006      * at index <code>start</code> and matching up to length <code>length</code>
   1007      */
   1008     protected static boolean contains(String value, int start, int length,
   1009                                       String[] criteria) {
   1010         boolean result = false;
   1011         if (start >= 0 && start + length <= value.length()) {
   1012             String target = value.substring(start, start + length);
   1013 
   1014             for (int i = 0; i < criteria.length; i++) {
   1015                 if (target.equals(criteria[i])) {
   1016                     result = true;
   1017                     break;
   1018                 }
   1019             }
   1020         }
   1021         return result;
   1022     }
   1023 
   1024     //-- BEGIN INNER CLASSES --//
   1025 
   1026     /**
   1027      * Inner class for storing results, since there is the optional alternate
   1028      * encoding.
   1029      */
   1030     public class DoubleMetaphoneResult {
   1031 
   1032         private StringBuffer primary = new StringBuffer(getMaxCodeLen());
   1033         private StringBuffer alternate = new StringBuffer(getMaxCodeLen());
   1034         private int maxLength;
   1035 
   1036         public DoubleMetaphoneResult(int maxLength) {
   1037             this.maxLength = maxLength;
   1038         }
   1039 
   1040         public void append(char value) {
   1041             appendPrimary(value);
   1042             appendAlternate(value);
   1043         }
   1044 
   1045         public void append(char primary, char alternate) {
   1046             appendPrimary(primary);
   1047             appendAlternate(alternate);
   1048         }
   1049 
   1050         public void appendPrimary(char value) {
   1051             if (this.primary.length() < this.maxLength) {
   1052                 this.primary.append(value);
   1053             }
   1054         }
   1055 
   1056         public void appendAlternate(char value) {
   1057             if (this.alternate.length() < this.maxLength) {
   1058                 this.alternate.append(value);
   1059             }
   1060         }
   1061 
   1062         public void append(String value) {
   1063             appendPrimary(value);
   1064             appendAlternate(value);
   1065         }
   1066 
   1067         public void append(String primary, String alternate) {
   1068             appendPrimary(primary);
   1069             appendAlternate(alternate);
   1070         }
   1071 
   1072         public void appendPrimary(String value) {
   1073             int addChars = this.maxLength - this.primary.length();
   1074             if (value.length() <= addChars) {
   1075                 this.primary.append(value);
   1076             } else {
   1077                 this.primary.append(value.substring(0, addChars));
   1078             }
   1079         }
   1080 
   1081         public void appendAlternate(String value) {
   1082             int addChars = this.maxLength - this.alternate.length();
   1083             if (value.length() <= addChars) {
   1084                 this.alternate.append(value);
   1085             } else {
   1086                 this.alternate.append(value.substring(0, addChars));
   1087             }
   1088         }
   1089 
   1090         public String getPrimary() {
   1091             return this.primary.toString();
   1092         }
   1093 
   1094         public String getAlternate() {
   1095             return this.alternate.toString();
   1096         }
   1097 
   1098         public boolean isComplete() {
   1099             return this.primary.length() >= this.maxLength &&
   1100                 this.alternate.length() >= this.maxLength;
   1101         }
   1102     }
   1103 }
   1104