Home | History | Annotate | Download | only in language
      1 /*
      2  * Copyright 2001-2004 The Apache Software Foundation.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package org.apache.commons.codec.language;
     18 
     19 import org.apache.commons.codec.EncoderException;
     20 import org.apache.commons.codec.StringEncoder;
     21 
     22 /**
     23  * Encodes a string into a double metaphone value.
     24  * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
     25  * <ul>
     26  * <li>Original Article: <a
     27  * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
     28  * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
     29  * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
     30  * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
     31  * </ul>
     32  *
     33  * @author Apache Software Foundation
     34  * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
     35  *
     36  * @deprecated Please use {@link java.net.URL#openConnection} instead.
     37  *     Please visit <a href="http://android-developers.blogspot.com/2011/09/androids-http-clients.html">this webpage</a>
     38  *     for further details.
     39  */
     40 @Deprecated
     41 public class DoubleMetaphone implements StringEncoder {
     42 
    /**
     * "Vowels" to test for
     */
    private static final String VOWELS = "AEIOUY";

    /**
     * Prefixes which, when present, are not pronounced
     */
    private static final String[] SILENT_START =
    { "GN", "KN", "PN", "WR", "PS" };
    // Criteria list of single characters: L, R, N, M, B, H, F, V, W and space.
    private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
    { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
    // Two-letter sequences that handleG looks for directly after a
    // word-initial 'G'.
    private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
    { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
    // Single letters that handleJ tests against the character following a 'J'.
    private static final String[] L_T_K_S_N_M_B_Z =
    { "L", "T", "K", "S", "N", "M", "B", "Z" };

    /**
     * Maximum length of an encoding, default is 4
     */
    protected int maxCodeLen = 4;
     64 
     65     /**
     66      * Creates an instance of this DoubleMetaphone encoder
     67      */
     68     public DoubleMetaphone() {
     69         super();
     70     }
     71 
     72     /**
     73      * Encode a value with Double Metaphone
     74      *
     75      * @param value String to encode
     76      * @return an encoded string
     77      */
     78     public String doubleMetaphone(String value) {
     79         return doubleMetaphone(value, false);
     80     }
     81 
     82     /**
     83      * Encode a value with Double Metaphone, optionally using the alternate
     84      * encoding.
     85      *
     86      * @param value String to encode
     87      * @param alternate use alternate encode
     88      * @return an encoded string
     89      */
     90     public String doubleMetaphone(String value, boolean alternate) {
     91         value = cleanInput(value);
     92         if (value == null) {
     93             return null;
     94         }
     95 
     96         boolean slavoGermanic = isSlavoGermanic(value);
     97         int index = isSilentStart(value) ? 1 : 0;
     98 
     99         DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
    100 
    101         while (!result.isComplete() && index <= value.length() - 1) {
    102             switch (value.charAt(index)) {
    103             case 'A':
    104             case 'E':
    105             case 'I':
    106             case 'O':
    107             case 'U':
    108             case 'Y':
    109                 index = handleAEIOUY(value, result, index);
    110                 break;
    111             case 'B':
    112                 result.append('P');
    113                 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
    114                 break;
    115             case '\u00C7':
    116                 // A C with a Cedilla
    117                 result.append('S');
    118                 index++;
    119                 break;
    120             case 'C':
    121                 index = handleC(value, result, index);
    122                 break;
    123             case 'D':
    124                 index = handleD(value, result, index);
    125                 break;
    126             case 'F':
    127                 result.append('F');
    128                 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
    129                 break;
    130             case 'G':
    131                 index = handleG(value, result, index, slavoGermanic);
    132                 break;
    133             case 'H':
    134                 index = handleH(value, result, index);
    135                 break;
    136             case 'J':
    137                 index = handleJ(value, result, index, slavoGermanic);
    138                 break;
    139             case 'K':
    140                 result.append('K');
    141                 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
    142                 break;
    143             case 'L':
    144                 index = handleL(value, result, index);
    145                 break;
    146             case 'M':
    147                 result.append('M');
    148                 index = conditionM0(value, index) ? index + 2 : index + 1;
    149                 break;
    150             case 'N':
    151                 result.append('N');
    152                 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
    153                 break;
    154             case '\u00D1':
    155                 // N with a tilde (spanish ene)
    156                 result.append('N');
    157                 index++;
    158                 break;
    159             case 'P':
    160                 index = handleP(value, result, index);
    161                 break;
    162             case 'Q':
    163                 result.append('K');
    164                 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
    165                 break;
    166             case 'R':
    167                 index = handleR(value, result, index, slavoGermanic);
    168                 break;
    169             case 'S':
    170                 index = handleS(value, result, index, slavoGermanic);
    171                 break;
    172             case 'T':
    173                 index = handleT(value, result, index);
    174                 break;
    175             case 'V':
    176                 result.append('F');
    177                 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
    178                 break;
    179             case 'W':
    180                 index = handleW(value, result, index);
    181                 break;
    182             case 'X':
    183                 index = handleX(value, result, index);
    184                 break;
    185             case 'Z':
    186                 index = handleZ(value, result, index, slavoGermanic);
    187                 break;
    188             default:
    189                 index++;
    190                 break;
    191             }
    192         }
    193 
    194         return alternate ? result.getAlternate() : result.getPrimary();
    195     }
    196 
    197     /**
    198      * Encode the value using DoubleMetaphone.  It will only work if
    199      * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
    200      *
    201      * @param obj Object to encode (should be of type String)
    202      * @return An encoded Object (will be of type String)
    203      * @throws EncoderException encode parameter is not of type String
    204      */
    205     public Object encode(Object obj) throws EncoderException {
    206         if (!(obj instanceof String)) {
    207             throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
    208         }
    209         return doubleMetaphone((String) obj);
    210     }
    211 
    212     /**
    213      * Encode the value using DoubleMetaphone.
    214      *
    215      * @param value String to encode
    216      * @return An encoded String
    217      */
    218     public String encode(String value) {
    219         return doubleMetaphone(value);
    220     }
    221 
    222     /**
    223      * Check if the Double Metaphone values of two <code>String</code> values
    224      * are equal.
    225      *
    226      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
    227      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
    228      * @return <code>true</code> if the encoded <code>String</code>s are equal;
    229      *          <code>false</code> otherwise.
    230      * @see #isDoubleMetaphoneEqual(String,String,boolean)
    231      */
    232     public boolean isDoubleMetaphoneEqual(String value1, String value2) {
    233         return isDoubleMetaphoneEqual(value1, value2, false);
    234     }
    235 
    236     /**
    237      * Check if the Double Metaphone values of two <code>String</code> values
    238      * are equal, optionally using the alternate value.
    239      *
    240      * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
    241      * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
    242      * @param alternate use the alternate value if <code>true</code>.
    243      * @return <code>true</code> if the encoded <code>String</code>s are equal;
    244      *          <code>false</code> otherwise.
    245      */
    246     public boolean isDoubleMetaphoneEqual(String value1,
    247                                           String value2,
    248                                           boolean alternate) {
    249         return doubleMetaphone(value1, alternate).equals(doubleMetaphone
    250                                                          (value2, alternate));
    251     }
    252 
    253     /**
    254      * Returns the maxCodeLen.
    255      * @return int
    256      */
    257     public int getMaxCodeLen() {
    258         return this.maxCodeLen;
    259     }
    260 
    /**
     * Sets the maxCodeLen. The value is stored as given; no range check is
     * applied here.
     *
     * @param maxCodeLen The maxCodeLen to set
     */
    public void setMaxCodeLen(int maxCodeLen) {
        this.maxCodeLen = maxCodeLen;
    }
    268 
    269     //-- BEGIN HANDLERS --//
    270 
    271     /**
    272      * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
    273      */
    274     private int handleAEIOUY(String value, DoubleMetaphoneResult result, int
    275                              index) {
    276         if (index == 0) {
    277             result.append('A');
    278         }
    279         return index + 1;
    280     }
    281 
    282     /**
    283      * Handles 'C' cases
    284      */
    285     private int handleC(String value,
    286                         DoubleMetaphoneResult result,
    287                         int index) {
    288         if (conditionC0(value, index)) {  // very confusing, moved out
    289             result.append('K');
    290             index += 2;
    291         } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
    292             result.append('S');
    293             index += 2;
    294         } else if (contains(value, index, 2, "CH")) {
    295             index = handleCH(value, result, index);
    296         } else if (contains(value, index, 2, "CZ") &&
    297                    !contains(value, index - 2, 4, "WICZ")) {
    298             //-- "Czerny" --//
    299             result.append('S', 'X');
    300             index += 2;
    301         } else if (contains(value, index + 1, 3, "CIA")) {
    302             //-- "focaccia" --//
    303             result.append('X');
    304             index += 3;
    305         } else if (contains(value, index, 2, "CC") &&
    306                    !(index == 1 && charAt(value, 0) == 'M')) {
    307             //-- double "cc" but not "McClelland" --//
    308             return handleCC(value, result, index);
    309         } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
    310             result.append('K');
    311             index += 2;
    312         } else if (contains(value, index, 2, "CI", "CE", "CY")) {
    313             //-- Italian vs. English --//
    314             if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
    315                 result.append('S', 'X');
    316             } else {
    317                 result.append('S');
    318             }
    319             index += 2;
    320         } else {
    321             result.append('K');
    322             if (contains(value, index + 1, 2, " C", " Q", " G")) {
    323                 //-- Mac Caffrey, Mac Gregor --//
    324                 index += 3;
    325             } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
    326                        !contains(value, index + 1, 2, "CE", "CI")) {
    327                 index += 2;
    328             } else {
    329                 index++;
    330             }
    331         }
    332 
    333         return index;
    334     }
    335 
    336     /**
    337      * Handles 'CC' cases
    338      */
    339     private int handleCC(String value,
    340                          DoubleMetaphoneResult result,
    341                          int index) {
    342         if (contains(value, index + 2, 1, "I", "E", "H") &&
    343             !contains(value, index + 2, 2, "HU")) {
    344             //-- "bellocchio" but not "bacchus" --//
    345             if ((index == 1 && charAt(value, index - 1) == 'A') ||
    346                 contains(value, index - 1, 5, "UCCEE", "UCCES")) {
    347                 //-- "accident", "accede", "succeed" --//
    348                 result.append("KS");
    349             } else {
    350                 //-- "bacci", "bertucci", other Italian --//
    351                 result.append('X');
    352             }
    353             index += 3;
    354         } else {    // Pierce's rule
    355             result.append('K');
    356             index += 2;
    357         }
    358 
    359         return index;
    360     }
    361 
    362     /**
    363      * Handles 'CH' cases
    364      */
    365     private int handleCH(String value,
    366                          DoubleMetaphoneResult result,
    367                          int index) {
    368         if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
    369             result.append('K', 'X');
    370             return index + 2;
    371         } else if (conditionCH0(value, index)) {
    372             //-- Greek roots ("chemistry", "chorus", etc.) --//
    373             result.append('K');
    374             return index + 2;
    375         } else if (conditionCH1(value, index)) {
    376             //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
    377             result.append('K');
    378             return index + 2;
    379         } else {
    380             if (index > 0) {
    381                 if (contains(value, 0, 2, "MC")) {
    382                     result.append('K');
    383                 } else {
    384                     result.append('X', 'K');
    385                 }
    386             } else {
    387                 result.append('X');
    388             }
    389             return index + 2;
    390         }
    391     }
    392 
    393     /**
    394      * Handles 'D' cases
    395      */
    396     private int handleD(String value,
    397                         DoubleMetaphoneResult result,
    398                         int index) {
    399         if (contains(value, index, 2, "DG")) {
    400             //-- "Edge" --//
    401             if (contains(value, index + 2, 1, "I", "E", "Y")) {
    402                 result.append('J');
    403                 index += 3;
    404                 //-- "Edgar" --//
    405             } else {
    406                 result.append("TK");
    407                 index += 2;
    408             }
    409         } else if (contains(value, index, 2, "DT", "DD")) {
    410             result.append('T');
    411             index += 2;
    412         } else {
    413             result.append('T');
    414             index++;
    415         }
    416         return index;
    417     }
    418 
    /**
     * Handles 'G' cases. The branches are tried in order and the first match
     * decides what is appended and how far the scan advances; "GH" is
     * delegated to handleGH.
     *
     * @param value the string being encoded
     * @param result result object the code is appended to
     * @param index position of the 'G' in <code>value</code>
     * @param slavoGermanic flag the caller obtained from isSlavoGermanic(value)
     * @return the position at which scanning should continue
     */
    private int handleG(String value,
                        DoubleMetaphoneResult result,
                        int index,
                        boolean slavoGermanic) {
        if (charAt(value, index + 1) == 'H') {
            index = handleGH(value, result, index);
        } else if (charAt(value, index + 1) == 'N') {
            if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
                result.append("KN", "N");
            } else if (!contains(value, index + 2, 2, "EY") &&
                       // NOTE(review): this tests the same position that was
                       // just found to be 'N' above, so it looks always true
                       // - confirm whether index + 2 was intended.
                       charAt(value, index + 1) != 'Y' && !slavoGermanic) {
                result.append("N", "KN");
            } else {
                result.append("KN");
            }
            index = index + 2;
        } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
            result.append("KL", "L");
            index += 2;
        } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
            //-- -ges-, -gep-, -gel-, -gie- at beginning --//
            result.append('K', 'J');
            index += 2;
        } else if ((contains(value, index + 1, 2, "ER") ||
                    charAt(value, index + 1) == 'Y') &&
                   !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
                   !contains(value, index - 1, 1, "E", "I") &&
                   !contains(value, index - 1, 3, "RGY", "OGY")) {
            //-- -ger-, -gy- --//
            result.append('K', 'J');
            index += 2;
        } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
                   contains(value, index - 1, 4, "AGGI", "OGGI")) {
            //-- Italian "biaggi" --//
            if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) {
                //-- obvious germanic --//
                result.append('K');
            // NOTE(review): a length of 4 is requested for the 3-character
            // literal "IER"; whether this can ever match depends on how
            // contains() compares - verify against that helper.
            } else if (contains(value, index + 1, 4, "IER")) {
                result.append('J');
            } else {
                result.append('J', 'K');
            }
            index += 2;
        } else if (charAt(value, index + 1) == 'G') {
            //-- doubled "GG": both letters consumed, single 'K' appended --//
            index += 2;
            result.append('K');
        } else {
            //-- any other 'G': single 'K' --//
            index++;
            result.append('K');
        }
        return index;
    }
    474 
    475     /**
    476      * Handles 'GH' cases
    477      */
    478     private int handleGH(String value,
    479                          DoubleMetaphoneResult result,
    480                          int index) {
    481         if (index > 0 && !isVowel(charAt(value, index - 1))) {
    482             result.append('K');
    483             index += 2;
    484         } else if (index == 0) {
    485             if (charAt(value, index + 2) == 'I') {
    486                 result.append('J');
    487             } else {
    488                 result.append('K');
    489             }
    490             index += 2;
    491         } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
    492                    (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
    493                    (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
    494             //-- Parker's rule (with some further refinements) - "hugh"
    495             index += 2;
    496         } else {
    497             if (index > 2 && charAt(value, index - 1) == 'U' &&
    498                 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
    499                 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
    500                 result.append('F');
    501             } else if (index > 0 && charAt(value, index - 1) != 'I') {
    502                 result.append('K');
    503             }
    504             index += 2;
    505         }
    506         return index;
    507     }
    508 
    509     /**
    510      * Handles 'H' cases
    511      */
    512     private int handleH(String value,
    513                         DoubleMetaphoneResult result,
    514                         int index) {
    515         //-- only keep if first & before vowel or between 2 vowels --//
    516         if ((index == 0 || isVowel(charAt(value, index - 1))) &&
    517             isVowel(charAt(value, index + 1))) {
    518             result.append('H');
    519             index += 2;
    520             //-- also takes car of "HH" --//
    521         } else {
    522             index++;
    523         }
    524         return index;
    525     }
    526 
    527     /**
    528      * Handles 'J' cases
    529      */
    530     private int handleJ(String value, DoubleMetaphoneResult result, int index,
    531                         boolean slavoGermanic) {
    532         if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
    533                 //-- obvious Spanish, "Jose", "San Jacinto" --//
    534                 if ((index == 0 && (charAt(value, index + 4) == ' ') ||
    535                      value.length() == 4) || contains(value, 0, 4, "SAN ")) {
    536                     result.append('H');
    537                 } else {
    538                     result.append('J', 'H');
    539                 }
    540                 index++;
    541             } else {
    542                 if (index == 0 && !contains(value, index, 4, "JOSE")) {
    543                     result.append('J', 'A');
    544                 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
    545                               (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
    546                     result.append('J', 'H');
    547                 } else if (index == value.length() - 1) {
    548                     result.append('J', ' ');
    549                 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) {
    550                     result.append('J');
    551                 }
    552 
    553                 if (charAt(value, index + 1) == 'J') {
    554                     index += 2;
    555                 } else {
    556                     index++;
    557                 }
    558             }
    559         return index;
    560     }
    561 
    562     /**
    563      * Handles 'L' cases
    564      */
    565     private int handleL(String value,
    566                         DoubleMetaphoneResult result,
    567                         int index) {
    568         result.append('L');
    569         if (charAt(value, index + 1) == 'L') {
    570             if (conditionL0(value, index)) {
    571                 result.appendAlternate(' ');
    572             }
    573             index += 2;
    574         } else {
    575             index++;
    576         }
    577         return index;
    578     }
    579 
    580     /**
    581      * Handles 'P' cases
    582      */
    583     private int handleP(String value,
    584                         DoubleMetaphoneResult result,
    585                         int index) {
    586         if (charAt(value, index + 1) == 'H') {
    587             result.append('F');
    588             index += 2;
    589         } else {
    590             result.append('P');
    591             index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
    592         }
    593         return index;
    594     }
    595 
    596     /**
    597      * Handles 'R' cases
    598      */
    599     private int handleR(String value,
    600                         DoubleMetaphoneResult result,
    601                         int index,
    602                         boolean slavoGermanic) {
    603         if (index == value.length() - 1 && !slavoGermanic &&
    604             contains(value, index - 2, 2, "IE") &&
    605             !contains(value, index - 4, 2, "ME", "MA")) {
    606             result.appendAlternate('R');
    607         } else {
    608             result.append('R');
    609         }
    610         return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
    611     }
    612 
    613     /**
    614      * Handles 'S' cases
    615      */
    616     private int handleS(String value,
    617                         DoubleMetaphoneResult result,
    618                         int index,
    619                         boolean slavoGermanic) {
    620         if (contains(value, index - 1, 3, "ISL", "YSL")) {
    621             //-- special cases "island", "isle", "carlisle", "carlysle" --//
    622             index++;
    623         } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
    624             //-- special case "sugar-" --//
    625             result.append('X', 'S');
    626             index++;
    627         } else if (contains(value, index, 2, "SH")) {
    628             if (contains(value, index + 1, 4,
    629                          "HEIM", "HOEK", "HOLM", "HOLZ")) {
    630                 //-- germanic --//
    631                 result.append('S');
    632             } else {
    633                 result.append('X');
    634             }
    635             index += 2;
    636         } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
    637             //-- Italian and Armenian --//
    638             if (slavoGermanic) {
    639                 result.append('S');
    640             } else {
    641                 result.append('S', 'X');
    642             }
    643             index += 3;
    644         } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
    645             //-- german & anglicisations, e.g. "smith" match "schmidt" //
    646             // "snider" match "schneider" --//
    647             //-- also, -sz- in slavic language altho in hungarian it //
    648             //   is pronounced "s" --//
    649             result.append('S', 'X');
    650             index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
    651         } else if (contains(value, index, 2, "SC")) {
    652             index = handleSC(value, result, index);
    653         } else {
    654             if (index == value.length() - 1 && contains(value, index - 2,
    655                                                         2, "AI", "OI")){
    656                 //-- french e.g. "resnais", "artois" --//
    657                 result.appendAlternate('S');
    658             } else {
    659                 result.append('S');
    660             }
    661             index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
    662         }
    663         return index;
    664     }
    665 
    666     /**
    667      * Handles 'SC' cases
    668      */
    669     private int handleSC(String value,
    670                          DoubleMetaphoneResult result,
    671                          int index) {
    672         if (charAt(value, index + 2) == 'H') {
    673             //-- Schlesinger's rule --//
    674             if (contains(value, index + 3,
    675                          2, "OO", "ER", "EN", "UY", "ED", "EM")) {
    676                 //-- Dutch origin, e.g. "school", "schooner" --//
    677                 if (contains(value, index + 3, 2, "ER", "EN")) {
    678                     //-- "schermerhorn", "schenker" --//
    679                     result.append("X", "SK");
    680                 } else {
    681                     result.append("SK");
    682                 }
    683             } else {
    684                 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
    685                     result.append('X', 'S');
    686                 } else {
    687                     result.append('X');
    688                 }
    689             }
    690         } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
    691             result.append('S');
    692         } else {
    693             result.append("SK");
    694         }
    695         return index + 3;
    696     }
    697 
    698     /**
    699      * Handles 'T' cases
    700      */
    701     private int handleT(String value,
    702                         DoubleMetaphoneResult result,
    703                         int index) {
    704         if (contains(value, index, 4, "TION")) {
    705             result.append('X');
    706             index += 3;
    707         } else if (contains(value, index, 3, "TIA", "TCH")) {
    708             result.append('X');
    709             index += 3;
    710         } else if (contains(value, index, 2, "TH") || contains(value, index,
    711                                                                3, "TTH")) {
    712             if (contains(value, index + 2, 2, "OM", "AM") ||
    713                 //-- special case "thomas", "thames" or germanic --//
    714                 contains(value, 0, 4, "VAN ", "VON ") ||
    715                 contains(value, 0, 3, "SCH")) {
    716                 result.append('T');
    717             } else {
    718                 result.append('0', 'T');
    719             }
    720             index += 2;
    721         } else {
    722             result.append('T');
    723             index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
    724         }
    725         return index;
    726     }
    727 
    728     /**
    729      * Handles 'W' cases
    730      */
    731     private int handleW(String value,
    732                         DoubleMetaphoneResult result,
    733                         int index) {
    734         if (contains(value, index, 2, "WR")) {
    735             //-- can also be in middle of word --//
    736             result.append('R');
    737             index += 2;
    738         } else {
    739             if (index == 0 && (isVowel(charAt(value, index + 1)) ||
    740                                contains(value, index, 2, "WH"))) {
    741                 if (isVowel(charAt(value, index + 1))) {
    742                     //-- Wasserman should match Vasserman --//
    743                     result.append('A', 'F');
    744                 } else {
    745                     //-- need Uomo to match Womo --//
    746                     result.append('A');
    747                 }
    748                 index++;
    749             } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
    750                        contains(value, index - 1,
    751                                 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
    752                        contains(value, 0, 3, "SCH")) {
    753                 //-- Arnow should match Arnoff --//
    754                 result.appendAlternate('F');
    755                 index++;
    756             } else if (contains(value, index, 4, "WICZ", "WITZ")) {
    757                 //-- Polish e.g. "filipowicz" --//
    758                 result.append("TS", "FX");
    759                 index += 4;
    760             } else {
    761                 index++;
    762             }
    763         }
    764         return index;
    765     }
    766 
    767     /**
    768      * Handles 'X' cases
    769      */
    770     private int handleX(String value,
    771                         DoubleMetaphoneResult result,
    772                         int index) {
    773         if (index == 0) {
    774             result.append('S');
    775             index++;
    776         } else {
    777             if (!((index == value.length() - 1) &&
    778                   (contains(value, index - 3, 3, "IAU", "EAU") ||
    779                    contains(value, index - 2, 2, "AU", "OU")))) {
    780                 //-- French e.g. breaux --//
    781                 result.append("KS");
    782             }
    783             index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
    784         }
    785         return index;
    786     }
    787 
    788     /**
    789      * Handles 'Z' cases
    790      */
    791     private int handleZ(String value, DoubleMetaphoneResult result, int index,
    792                         boolean slavoGermanic) {
    793         if (charAt(value, index + 1) == 'H') {
    794             //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
    795             result.append('J');
    796             index += 2;
    797         } else {
    798             if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
    799                 result.append("S", "TS");
    800             } else {
    801                 result.append('S');
    802             }
    803             index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
    804         }
    805         return index;
    806     }
    807 
    808     //-- BEGIN CONDITIONS --//
    809 
    810     /**
    811      * Complex condition 0 for 'C'
    812      */
    813     private boolean conditionC0(String value, int index) {
    814         if (contains(value, index, 4, "CHIA")) {
    815             return true;
    816         } else if (index <= 1) {
    817             return false;
    818         } else if (isVowel(charAt(value, index - 2))) {
    819             return false;
    820         } else if (!contains(value, index - 1, 3, "ACH")) {
    821             return false;
    822         } else {
    823             char c = charAt(value, index + 2);
    824             return (c != 'I' && c != 'E')
    825                     || contains(value, index - 2, 6, "BACHER", "MACHER");
    826         }
    827     }
    828 
    829     /**
    830      * Complex condition 0 for 'CH'
    831      */
    832     private boolean conditionCH0(String value, int index) {
    833         if (index != 0) {
    834             return false;
    835         } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
    836                    !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
    837             return false;
    838         } else if (contains(value, 0, 5, "CHORE")) {
    839             return false;
    840         } else {
    841             return true;
    842         }
    843     }
    844 
    845     /**
    846      * Complex condition 1 for 'CH'
    847      */
    848     private boolean conditionCH1(String value, int index) {
    849         return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
    850                                                                    3, "SCH")) ||
    851                 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
    852                 contains(value, index + 2, 1, "T", "S") ||
    853                 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
    854                  (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
    855     }
    856 
    857     /**
    858      * Complex condition 0 for 'L'
    859      */
    860     private boolean conditionL0(String value, int index) {
    861         if (index == value.length() - 3 &&
    862             contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
    863             return true;
    864         } else if ((contains(value, index - 1, 2, "AS", "OS") ||
    865                     contains(value, value.length() - 1, 1, "A", "O")) &&
    866                    contains(value, index - 1, 4, "ALLE")) {
    867             return true;
    868         } else {
    869             return false;
    870         }
    871     }
    872 
    873     /**
    874      * Complex condition 0 for 'M'
    875      */
    876     private boolean conditionM0(String value, int index) {
    877         if (charAt(value, index + 1) == 'M') {
    878             return true;
    879         }
    880         return contains(value, index - 1, 3, "UMB")
    881                 && ((index + 1) == value.length() - 1 || contains(value,
    882                         index + 2, 2, "ER"));
    883     }
    884 
    885     //-- BEGIN HELPER FUNCTIONS --//
    886 
    887     /**
    888      * Determines whether or not a value is of slavo-germanic orgin. A value is
    889      * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
    890      */
    891     private boolean isSlavoGermanic(String value) {
    892         return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
    893             value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
    894     }
    895 
    896     /**
    897      * Determines whether or not a character is a vowel or not
    898      */
    899     private boolean isVowel(char ch) {
    900         return VOWELS.indexOf(ch) != -1;
    901     }
    902 
    903     /**
    904      * Determines whether or not the value starts with a silent letter.  It will
    905      * return <code>true</code> if the value starts with any of 'GN', 'KN',
    906      * 'PN', 'WR' or 'PS'.
    907      */
    908     private boolean isSilentStart(String value) {
    909         boolean result = false;
    910         for (int i = 0; i < SILENT_START.length; i++) {
    911             if (value.startsWith(SILENT_START[i])) {
    912                 result = true;
    913                 break;
    914             }
    915         }
    916         return result;
    917     }
    918 
    919     /**
    920      * Cleans the input
    921      */
    922     private String cleanInput(String input) {
    923         if (input == null) {
    924             return null;
    925         }
    926         input = input.trim();
    927         if (input.length() == 0) {
    928             return null;
    929         }
    930         return input.toUpperCase();
    931     }
    932 
    933     /**
    934      * Gets the character at index <code>index</code> if available, otherwise
    935      * it returns <code>Character.MIN_VALUE</code> so that there is some sort
    936      * of a default
    937      */
    938     protected char charAt(String value, int index) {
    939         if (index < 0 || index >= value.length()) {
    940             return Character.MIN_VALUE;
    941         }
    942         return value.charAt(index);
    943     }
    944 
    945     /**
    946      * Shortcut method with 1 criteria
    947      */
    948     private static boolean contains(String value, int start, int length,
    949                                     String criteria) {
    950         return contains(value, start, length,
    951                         new String[] { criteria });
    952     }
    953 
    954     /**
    955      * Shortcut method with 2 criteria
    956      */
    957     private static boolean contains(String value, int start, int length,
    958                                     String criteria1, String criteria2) {
    959         return contains(value, start, length,
    960                         new String[] { criteria1, criteria2 });
    961     }
    962 
    963     /**
    964      * Shortcut method with 3 criteria
    965      */
    966     private static boolean contains(String value, int start, int length,
    967                                     String criteria1, String criteria2,
    968                                     String criteria3) {
    969         return contains(value, start, length,
    970                         new String[] { criteria1, criteria2, criteria3 });
    971     }
    972 
    973     /**
    974      * Shortcut method with 4 criteria
    975      */
    976     private static boolean contains(String value, int start, int length,
    977                                     String criteria1, String criteria2,
    978                                     String criteria3, String criteria4) {
    979         return contains(value, start, length,
    980                         new String[] { criteria1, criteria2, criteria3,
    981                                        criteria4 });
    982     }
    983 
    984     /**
    985      * Shortcut method with 5 criteria
    986      */
    987     private static boolean contains(String value, int start, int length,
    988                                     String criteria1, String criteria2,
    989                                     String criteria3, String criteria4,
    990                                     String criteria5) {
    991         return contains(value, start, length,
    992                         new String[] { criteria1, criteria2, criteria3,
    993                                        criteria4, criteria5 });
    994     }
    995 
    996     /**
    997      * Shortcut method with 6 criteria
    998      */
    999     private static boolean contains(String value, int start, int length,
   1000                                     String criteria1, String criteria2,
   1001                                     String criteria3, String criteria4,
   1002                                     String criteria5, String criteria6) {
   1003         return contains(value, start, length,
   1004                         new String[] { criteria1, criteria2, criteria3,
   1005                                        criteria4, criteria5, criteria6 });
   1006     }
   1007 
   1008     /**
   1009      * Determines whether <code>value</code> contains any of the criteria
   1010      starting
   1011      * at index <code>start</code> and matching up to length <code>length</code>
   1012      */
   1013     protected static boolean contains(String value, int start, int length,
   1014                                       String[] criteria) {
   1015         boolean result = false;
   1016         if (start >= 0 && start + length <= value.length()) {
   1017             String target = value.substring(start, start + length);
   1018 
   1019             for (int i = 0; i < criteria.length; i++) {
   1020                 if (target.equals(criteria[i])) {
   1021                     result = true;
   1022                     break;
   1023                 }
   1024             }
   1025         }
   1026         return result;
   1027     }
   1028 
   1029     //-- BEGIN INNER CLASSES --//
   1030 
   1031     /**
   1032      * Inner class for storing results, since there is the optional alternate
   1033      * encoding.
   1034      */
   1035     public class DoubleMetaphoneResult {
   1036 
   1037         private StringBuffer primary = new StringBuffer(getMaxCodeLen());
   1038         private StringBuffer alternate = new StringBuffer(getMaxCodeLen());
   1039         private int maxLength;
   1040 
   1041         public DoubleMetaphoneResult(int maxLength) {
   1042             this.maxLength = maxLength;
   1043         }
   1044 
   1045         public void append(char value) {
   1046             appendPrimary(value);
   1047             appendAlternate(value);
   1048         }
   1049 
   1050         public void append(char primary, char alternate) {
   1051             appendPrimary(primary);
   1052             appendAlternate(alternate);
   1053         }
   1054 
   1055         public void appendPrimary(char value) {
   1056             if (this.primary.length() < this.maxLength) {
   1057                 this.primary.append(value);
   1058             }
   1059         }
   1060 
   1061         public void appendAlternate(char value) {
   1062             if (this.alternate.length() < this.maxLength) {
   1063                 this.alternate.append(value);
   1064             }
   1065         }
   1066 
   1067         public void append(String value) {
   1068             appendPrimary(value);
   1069             appendAlternate(value);
   1070         }
   1071 
   1072         public void append(String primary, String alternate) {
   1073             appendPrimary(primary);
   1074             appendAlternate(alternate);
   1075         }
   1076 
   1077         public void appendPrimary(String value) {
   1078             int addChars = this.maxLength - this.primary.length();
   1079             if (value.length() <= addChars) {
   1080                 this.primary.append(value);
   1081             } else {
   1082                 this.primary.append(value.substring(0, addChars));
   1083             }
   1084         }
   1085 
   1086         public void appendAlternate(String value) {
   1087             int addChars = this.maxLength - this.alternate.length();
   1088             if (value.length() <= addChars) {
   1089                 this.alternate.append(value);
   1090             } else {
   1091                 this.alternate.append(value.substring(0, addChars));
   1092             }
   1093         }
   1094 
   1095         public String getPrimary() {
   1096             return this.primary.toString();
   1097         }
   1098 
   1099         public String getAlternate() {
   1100             return this.alternate.toString();
   1101         }
   1102 
   1103         public boolean isComplete() {
   1104             return this.primary.length() >= this.maxLength &&
   1105                 this.alternate.length() >= this.maxLength;
   1106         }
   1107     }
   1108 }
   1109