Home | History | Annotate | Download | only in language
      1 /*
      2  * Copyright 2001-2004 The Apache Software Foundation.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package org.apache.commons.codec.language;
     18 
     19 import org.apache.commons.codec.EncoderException;
     20 import org.apache.commons.codec.StringEncoder;
     21 
     22 /**
     23  * Encodes a string into a metaphone value.
     24  * <p>
     25  * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
     26  * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
     27  * </p>
     28  * <p>
     29  * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
     30  * 39.</CITE>
     31  * </p>
     32  *
     33  * @author Apache Software Foundation
     34  * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
     35  */
     36 public class Metaphone implements StringEncoder {
     37 
     38     /**
     39      * Five values in the English language
     40      */
     41     private String vowels = "AEIOU" ;
     42 
     43     /**
     44      * Variable used in Metaphone algorithm
     45      */
     46     private String frontv = "EIY"   ;
     47 
     48     /**
     49      * Variable used in Metaphone algorithm
     50      */
     51     private String varson = "CSPTG" ;
     52 
     53     /**
     54      * The max code length for metaphone is 4
     55      */
     56     private int maxCodeLen = 4 ;
     57 
     58     /**
     59      * Creates an instance of the Metaphone encoder
     60      */
     61     public Metaphone() {
     62         super();
     63     }
     64 
     65     /**
     66      * Find the metaphone value of a String. This is similar to the
     67      * soundex algorithm, but better at finding similar sounding words.
     68      * All input is converted to upper case.
     69      * Limitations: Input format is expected to be a single ASCII word
     70      * with only characters in the A - Z range, no punctuation or numbers.
     71      *
     72      * @param txt String to find the metaphone code for
     73      * @return A metaphone code corresponding to the String supplied
     74      */
     75     public String metaphone(String txt) {
     76         boolean hard = false ;
     77         if ((txt == null) || (txt.length() == 0)) {
     78             return "" ;
     79         }
     80         // single character is itself
     81         if (txt.length() == 1) {
     82             return txt.toUpperCase() ;
     83         }
     84 
     85         char[] inwd = txt.toUpperCase().toCharArray() ;
     86 
     87         StringBuffer local = new StringBuffer(40); // manipulate
     88         StringBuffer code = new StringBuffer(10) ; //   output
     89         // handle initial 2 characters exceptions
     90         switch(inwd[0]) {
     91         case 'K' :
     92         case 'G' :
     93         case 'P' : /* looking for KN, etc*/
     94             if (inwd[1] == 'N') {
     95                 local.append(inwd, 1, inwd.length - 1);
     96             } else {
     97                 local.append(inwd);
     98             }
     99             break;
    100         case 'A': /* looking for AE */
    101             if (inwd[1] == 'E') {
    102                 local.append(inwd, 1, inwd.length - 1);
    103             } else {
    104                 local.append(inwd);
    105             }
    106             break;
    107         case 'W' : /* looking for WR or WH */
    108             if (inwd[1] == 'R') {   // WR -> R
    109                 local.append(inwd, 1, inwd.length - 1);
    110                 break ;
    111             }
    112             if (inwd[1] == 'H') {
    113                 local.append(inwd, 1, inwd.length - 1);
    114                 local.setCharAt(0, 'W'); // WH -> W
    115             } else {
    116                 local.append(inwd);
    117             }
    118             break;
    119         case 'X' : /* initial X becomes S */
    120             inwd[0] = 'S';
    121             local.append(inwd);
    122             break ;
    123         default :
    124             local.append(inwd);
    125         } // now local has working string with initials fixed
    126 
    127         int wdsz = local.length();
    128         int n = 0 ;
    129 
    130         while ((code.length() < this.getMaxCodeLen()) &&
    131                (n < wdsz) ) { // max code size of 4 works well
    132             char symb = local.charAt(n) ;
    133             // remove duplicate letters except C
    134             if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
    135                 n++ ;
    136             } else { // not dup
    137                 switch(symb) {
    138                 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
    139                     if (n == 0) {
    140                         code.append(symb);
    141                     }
    142                     break ; // only use vowel if leading char
    143                 case 'B' :
    144                     if ( isPreviousChar(local, n, 'M') &&
    145                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
    146                         break;
    147                     }
    148                     code.append(symb);
    149                     break;
    150                 case 'C' : // lots of C special cases
    151                     /* discard if SCI, SCE or SCY */
    152                     if ( isPreviousChar(local, n, 'S') &&
    153                          !isLastChar(wdsz, n) &&
    154                          (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) {
    155                         break;
    156                     }
    157                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
    158                         code.append('X');
    159                         break;
    160                     }
    161                     if (!isLastChar(wdsz, n) &&
    162                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
    163                         code.append('S');
    164                         break; // CI,CE,CY -> S
    165                     }
    166                     if (isPreviousChar(local, n, 'S') &&
    167                         isNextChar(local, n, 'H') ) { // SCH->sk
    168                         code.append('K') ;
    169                         break ;
    170                     }
    171                     if (isNextChar(local, n, 'H')) { // detect CH
    172                         if ((n == 0) &&
    173                             (wdsz >= 3) &&
    174                             isVowel(local,2) ) { // CH consonant -> K consonant
    175                             code.append('K');
    176                         } else {
    177                             code.append('X'); // CHvowel -> X
    178                         }
    179                     } else {
    180                         code.append('K');
    181                     }
    182                     break ;
    183                 case 'D' :
    184                     if (!isLastChar(wdsz, n + 1) &&
    185                         isNextChar(local, n, 'G') &&
    186                         (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
    187                         code.append('J'); n += 2 ;
    188                     } else {
    189                         code.append('T');
    190                     }
    191                     break ;
    192                 case 'G' : // GH silent at end or before consonant
    193                     if (isLastChar(wdsz, n + 1) &&
    194                         isNextChar(local, n, 'H')) {
    195                         break;
    196                     }
    197                     if (!isLastChar(wdsz, n + 1) &&
    198                         isNextChar(local,n,'H') &&
    199                         !isVowel(local,n+2)) {
    200                         break;
    201                     }
    202                     if ((n > 0) &&
    203                         ( regionMatch(local, n, "GN") ||
    204                           regionMatch(local, n, "GNED") ) ) {
    205                         break; // silent G
    206                     }
    207                     if (isPreviousChar(local, n, 'G')) {
    208                         hard = true ;
    209                     } else {
    210                         hard = false ;
    211                     }
    212                     if (!isLastChar(wdsz, n) &&
    213                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) &&
    214                         (!hard)) {
    215                         code.append('J');
    216                     } else {
    217                         code.append('K');
    218                     }
    219                     break ;
    220                 case 'H':
    221                     if (isLastChar(wdsz, n)) {
    222                         break ; // terminal H
    223                     }
    224                     if ((n > 0) &&
    225                         (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
    226                         break;
    227                     }
    228                     if (isVowel(local,n+1)) {
    229                         code.append('H'); // Hvowel
    230                     }
    231                     break;
    232                 case 'F':
    233                 case 'J' :
    234                 case 'L' :
    235                 case 'M':
    236                 case 'N' :
    237                 case 'R' :
    238                     code.append(symb);
    239                     break;
    240                 case 'K' :
    241                     if (n > 0) { // not initial
    242                         if (!isPreviousChar(local, n, 'C')) {
    243                             code.append(symb);
    244                         }
    245                     } else {
    246                         code.append(symb); // initial K
    247                     }
    248                     break ;
    249                 case 'P' :
    250                     if (isNextChar(local,n,'H')) {
    251                         // PH -> F
    252                         code.append('F');
    253                     } else {
    254                         code.append(symb);
    255                     }
    256                     break ;
    257                 case 'Q' :
    258                     code.append('K');
    259                     break;
    260                 case 'S' :
    261                     if (regionMatch(local,n,"SH") ||
    262                         regionMatch(local,n,"SIO") ||
    263                         regionMatch(local,n,"SIA")) {
    264                         code.append('X');
    265                     } else {
    266                         code.append('S');
    267                     }
    268                     break;
    269                 case 'T' :
    270                     if (regionMatch(local,n,"TIA") ||
    271                         regionMatch(local,n,"TIO")) {
    272                         code.append('X');
    273                         break;
    274                     }
    275                     if (regionMatch(local,n,"TCH")) {
    276                         // Silent if in "TCH"
    277                         break;
    278                     }
    279                     // substitute numeral 0 for TH (resembles theta after all)
    280                     if (regionMatch(local,n,"TH")) {
    281                         code.append('0');
    282                     } else {
    283                         code.append('T');
    284                     }
    285                     break ;
    286                 case 'V' :
    287                     code.append('F'); break ;
    288                 case 'W' : case 'Y' : // silent if not followed by vowel
    289                     if (!isLastChar(wdsz,n) &&
    290                         isVowel(local,n+1)) {
    291                         code.append(symb);
    292                     }
    293                     break ;
    294                 case 'X' :
    295                     code.append('K'); code.append('S');
    296                     break ;
    297                 case 'Z' :
    298                     code.append('S'); break ;
    299                 } // end switch
    300                 n++ ;
    301             } // end else from symb != 'C'
    302             if (code.length() > this.getMaxCodeLen()) {
    303                 code.setLength(this.getMaxCodeLen());
    304             }
    305         }
    306         return code.toString();
    307     }
    308 
    309     private boolean isVowel(StringBuffer string, int index) {
    310         return (this.vowels.indexOf(string.charAt(index)) >= 0);
    311     }
    312 
    313     private boolean isPreviousChar(StringBuffer string, int index, char c) {
    314         boolean matches = false;
    315         if( index > 0 &&
    316             index < string.length() ) {
    317             matches = string.charAt(index - 1) == c;
    318         }
    319         return matches;
    320     }
    321 
    322     private boolean isNextChar(StringBuffer string, int index, char c) {
    323         boolean matches = false;
    324         if( index >= 0 &&
    325             index < string.length() - 1 ) {
    326             matches = string.charAt(index + 1) == c;
    327         }
    328         return matches;
    329     }
    330 
    331     private boolean regionMatch(StringBuffer string, int index, String test) {
    332         boolean matches = false;
    333         if( index >= 0 &&
    334             (index + test.length() - 1) < string.length() ) {
    335             String substring = string.substring( index, index + test.length());
    336             matches = substring.equals( test );
    337         }
    338         return matches;
    339     }
    340 
    341     private boolean isLastChar(int wdsz, int n) {
    342         return n + 1 == wdsz;
    343     }
    344 
    345 
    346     /**
    347      * Encodes an Object using the metaphone algorithm.  This method
    348      * is provided in order to satisfy the requirements of the
    349      * Encoder interface, and will throw an EncoderException if the
    350      * supplied object is not of type java.lang.String.
    351      *
    352      * @param pObject Object to encode
    353      * @return An object (or type java.lang.String) containing the
    354      *         metaphone code which corresponds to the String supplied.
    355      * @throws EncoderException if the parameter supplied is not
    356      *                          of type java.lang.String
    357      */
    358     public Object encode(Object pObject) throws EncoderException {
    359         if (!(pObject instanceof java.lang.String)) {
    360             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
    361         }
    362         return metaphone((String) pObject);
    363     }
    364 
    365     /**
    366      * Encodes a String using the Metaphone algorithm.
    367      *
    368      * @param pString String object to encode
    369      * @return The metaphone code corresponding to the String supplied
    370      */
    371     public String encode(String pString) {
    372         return metaphone(pString);
    373     }
    374 
    375     /**
    376      * Tests is the metaphones of two strings are identical.
    377      *
    378      * @param str1 First of two strings to compare
    379      * @param str2 Second of two strings to compare
    380      * @return true if the metaphones of these strings are identical,
    381      *         false otherwise.
    382      */
    383     public boolean isMetaphoneEqual(String str1, String str2) {
    384         return metaphone(str1).equals(metaphone(str2));
    385     }
    386 
    387     /**
    388      * Returns the maxCodeLen.
    389      * @return int
    390      */
    391     public int getMaxCodeLen() { return this.maxCodeLen; }
    392 
    393     /**
    394      * Sets the maxCodeLen.
    395      * @param maxCodeLen The maxCodeLen to set
    396      */
    397     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
    398 
    399 }
    400