Home | History | Annotate | Download | only in language
      1 /*
      2  * Copyright 2001-2004 The Apache Software Foundation.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package org.apache.commons.codec.language;
     18 
     19 import org.apache.commons.codec.EncoderException;
     20 import org.apache.commons.codec.StringEncoder;
     21 
     22 /**
     23  * Encodes a string into a metaphone value.
     24  * <p>
     25  * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
     26  * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
     27  * </p>
     28  * <p>
     29  * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
     30  * 39.</CITE>
     31  * </p>
     32  *
     33  * @author Apache Software Foundation
     34  * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
     35  *
     36  * @deprecated Please use {@link java.net.URL#openConnection} instead.
     37  *     Please visit <a href="http://android-developers.blogspot.com/2011/09/androids-http-clients.html">this webpage</a>
     38  *     for further details.
     39  */
     40 @Deprecated
     41 public class Metaphone implements StringEncoder {
     42 
     43     /**
     44      * Five values in the English language
     45      */
     46     private String vowels = "AEIOU" ;
     47 
     48     /**
     49      * Variable used in Metaphone algorithm
     50      */
     51     private String frontv = "EIY"   ;
     52 
     53     /**
     54      * Variable used in Metaphone algorithm
     55      */
     56     private String varson = "CSPTG" ;
     57 
     58     /**
     59      * The max code length for metaphone is 4
     60      */
     61     private int maxCodeLen = 4 ;
     62 
     63     /**
     64      * Creates an instance of the Metaphone encoder
     65      */
     66     public Metaphone() {
     67         super();
     68     }
     69 
     70     /**
     71      * Find the metaphone value of a String. This is similar to the
     72      * soundex algorithm, but better at finding similar sounding words.
     73      * All input is converted to upper case.
     74      * Limitations: Input format is expected to be a single ASCII word
     75      * with only characters in the A - Z range, no punctuation or numbers.
     76      *
     77      * @param txt String to find the metaphone code for
     78      * @return A metaphone code corresponding to the String supplied
     79      */
     80     public String metaphone(String txt) {
     81         boolean hard = false ;
     82         if ((txt == null) || (txt.length() == 0)) {
     83             return "" ;
     84         }
     85         // single character is itself
     86         if (txt.length() == 1) {
     87             return txt.toUpperCase() ;
     88         }
     89 
     90         char[] inwd = txt.toUpperCase().toCharArray() ;
     91 
     92         StringBuffer local = new StringBuffer(40); // manipulate
     93         StringBuffer code = new StringBuffer(10) ; //   output
     94         // handle initial 2 characters exceptions
     95         switch(inwd[0]) {
     96         case 'K' :
     97         case 'G' :
     98         case 'P' : /* looking for KN, etc*/
     99             if (inwd[1] == 'N') {
    100                 local.append(inwd, 1, inwd.length - 1);
    101             } else {
    102                 local.append(inwd);
    103             }
    104             break;
    105         case 'A': /* looking for AE */
    106             if (inwd[1] == 'E') {
    107                 local.append(inwd, 1, inwd.length - 1);
    108             } else {
    109                 local.append(inwd);
    110             }
    111             break;
    112         case 'W' : /* looking for WR or WH */
    113             if (inwd[1] == 'R') {   // WR -> R
    114                 local.append(inwd, 1, inwd.length - 1);
    115                 break ;
    116             }
    117             if (inwd[1] == 'H') {
    118                 local.append(inwd, 1, inwd.length - 1);
    119                 local.setCharAt(0, 'W'); // WH -> W
    120             } else {
    121                 local.append(inwd);
    122             }
    123             break;
    124         case 'X' : /* initial X becomes S */
    125             inwd[0] = 'S';
    126             local.append(inwd);
    127             break ;
    128         default :
    129             local.append(inwd);
    130         } // now local has working string with initials fixed
    131 
    132         int wdsz = local.length();
    133         int n = 0 ;
    134 
    135         while ((code.length() < this.getMaxCodeLen()) &&
    136                (n < wdsz) ) { // max code size of 4 works well
    137             char symb = local.charAt(n) ;
    138             // remove duplicate letters except C
    139             if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
    140                 n++ ;
    141             } else { // not dup
    142                 switch(symb) {
    143                 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
    144                     if (n == 0) {
    145                         code.append(symb);
    146                     }
    147                     break ; // only use vowel if leading char
    148                 case 'B' :
    149                     if ( isPreviousChar(local, n, 'M') &&
    150                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
    151                         break;
    152                     }
    153                     code.append(symb);
    154                     break;
    155                 case 'C' : // lots of C special cases
    156                     /* discard if SCI, SCE or SCY */
    157                     if ( isPreviousChar(local, n, 'S') &&
    158                          !isLastChar(wdsz, n) &&
    159                          (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) {
    160                         break;
    161                     }
    162                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
    163                         code.append('X');
    164                         break;
    165                     }
    166                     if (!isLastChar(wdsz, n) &&
    167                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
    168                         code.append('S');
    169                         break; // CI,CE,CY -> S
    170                     }
    171                     if (isPreviousChar(local, n, 'S') &&
    172                         isNextChar(local, n, 'H') ) { // SCH->sk
    173                         code.append('K') ;
    174                         break ;
    175                     }
    176                     if (isNextChar(local, n, 'H')) { // detect CH
    177                         if ((n == 0) &&
    178                             (wdsz >= 3) &&
    179                             isVowel(local,2) ) { // CH consonant -> K consonant
    180                             code.append('K');
    181                         } else {
    182                             code.append('X'); // CHvowel -> X
    183                         }
    184                     } else {
    185                         code.append('K');
    186                     }
    187                     break ;
    188                 case 'D' :
    189                     if (!isLastChar(wdsz, n + 1) &&
    190                         isNextChar(local, n, 'G') &&
    191                         (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
    192                         code.append('J'); n += 2 ;
    193                     } else {
    194                         code.append('T');
    195                     }
    196                     break ;
    197                 case 'G' : // GH silent at end or before consonant
    198                     if (isLastChar(wdsz, n + 1) &&
    199                         isNextChar(local, n, 'H')) {
    200                         break;
    201                     }
    202                     if (!isLastChar(wdsz, n + 1) &&
    203                         isNextChar(local,n,'H') &&
    204                         !isVowel(local,n+2)) {
    205                         break;
    206                     }
    207                     if ((n > 0) &&
    208                         ( regionMatch(local, n, "GN") ||
    209                           regionMatch(local, n, "GNED") ) ) {
    210                         break; // silent G
    211                     }
    212                     if (isPreviousChar(local, n, 'G')) {
    213                         hard = true ;
    214                     } else {
    215                         hard = false ;
    216                     }
    217                     if (!isLastChar(wdsz, n) &&
    218                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) &&
    219                         (!hard)) {
    220                         code.append('J');
    221                     } else {
    222                         code.append('K');
    223                     }
    224                     break ;
    225                 case 'H':
    226                     if (isLastChar(wdsz, n)) {
    227                         break ; // terminal H
    228                     }
    229                     if ((n > 0) &&
    230                         (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
    231                         break;
    232                     }
    233                     if (isVowel(local,n+1)) {
    234                         code.append('H'); // Hvowel
    235                     }
    236                     break;
    237                 case 'F':
    238                 case 'J' :
    239                 case 'L' :
    240                 case 'M':
    241                 case 'N' :
    242                 case 'R' :
    243                     code.append(symb);
    244                     break;
    245                 case 'K' :
    246                     if (n > 0) { // not initial
    247                         if (!isPreviousChar(local, n, 'C')) {
    248                             code.append(symb);
    249                         }
    250                     } else {
    251                         code.append(symb); // initial K
    252                     }
    253                     break ;
    254                 case 'P' :
    255                     if (isNextChar(local,n,'H')) {
    256                         // PH -> F
    257                         code.append('F');
    258                     } else {
    259                         code.append(symb);
    260                     }
    261                     break ;
    262                 case 'Q' :
    263                     code.append('K');
    264                     break;
    265                 case 'S' :
    266                     if (regionMatch(local,n,"SH") ||
    267                         regionMatch(local,n,"SIO") ||
    268                         regionMatch(local,n,"SIA")) {
    269                         code.append('X');
    270                     } else {
    271                         code.append('S');
    272                     }
    273                     break;
    274                 case 'T' :
    275                     if (regionMatch(local,n,"TIA") ||
    276                         regionMatch(local,n,"TIO")) {
    277                         code.append('X');
    278                         break;
    279                     }
    280                     if (regionMatch(local,n,"TCH")) {
    281                         // Silent if in "TCH"
    282                         break;
    283                     }
    284                     // substitute numeral 0 for TH (resembles theta after all)
    285                     if (regionMatch(local,n,"TH")) {
    286                         code.append('0');
    287                     } else {
    288                         code.append('T');
    289                     }
    290                     break ;
    291                 case 'V' :
    292                     code.append('F'); break ;
    293                 case 'W' : case 'Y' : // silent if not followed by vowel
    294                     if (!isLastChar(wdsz,n) &&
    295                         isVowel(local,n+1)) {
    296                         code.append(symb);
    297                     }
    298                     break ;
    299                 case 'X' :
    300                     code.append('K'); code.append('S');
    301                     break ;
    302                 case 'Z' :
    303                     code.append('S'); break ;
    304                 } // end switch
    305                 n++ ;
    306             } // end else from symb != 'C'
    307             if (code.length() > this.getMaxCodeLen()) {
    308                 code.setLength(this.getMaxCodeLen());
    309             }
    310         }
    311         return code.toString();
    312     }
    313 
    314     private boolean isVowel(StringBuffer string, int index) {
    315         return (this.vowels.indexOf(string.charAt(index)) >= 0);
    316     }
    317 
    318     private boolean isPreviousChar(StringBuffer string, int index, char c) {
    319         boolean matches = false;
    320         if( index > 0 &&
    321             index < string.length() ) {
    322             matches = string.charAt(index - 1) == c;
    323         }
    324         return matches;
    325     }
    326 
    327     private boolean isNextChar(StringBuffer string, int index, char c) {
    328         boolean matches = false;
    329         if( index >= 0 &&
    330             index < string.length() - 1 ) {
    331             matches = string.charAt(index + 1) == c;
    332         }
    333         return matches;
    334     }
    335 
    336     private boolean regionMatch(StringBuffer string, int index, String test) {
    337         boolean matches = false;
    338         if( index >= 0 &&
    339             (index + test.length() - 1) < string.length() ) {
    340             String substring = string.substring( index, index + test.length());
    341             matches = substring.equals( test );
    342         }
    343         return matches;
    344     }
    345 
    346     private boolean isLastChar(int wdsz, int n) {
    347         return n + 1 == wdsz;
    348     }
    349 
    350 
    351     /**
    352      * Encodes an Object using the metaphone algorithm.  This method
    353      * is provided in order to satisfy the requirements of the
    354      * Encoder interface, and will throw an EncoderException if the
    355      * supplied object is not of type java.lang.String.
    356      *
    357      * @param pObject Object to encode
    358      * @return An object (or type java.lang.String) containing the
    359      *         metaphone code which corresponds to the String supplied.
    360      * @throws EncoderException if the parameter supplied is not
    361      *                          of type java.lang.String
    362      */
    363     public Object encode(Object pObject) throws EncoderException {
    364         if (!(pObject instanceof java.lang.String)) {
    365             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
    366         }
    367         return metaphone((String) pObject);
    368     }
    369 
    370     /**
    371      * Encodes a String using the Metaphone algorithm.
    372      *
    373      * @param pString String object to encode
    374      * @return The metaphone code corresponding to the String supplied
    375      */
    376     public String encode(String pString) {
    377         return metaphone(pString);
    378     }
    379 
    380     /**
    381      * Tests is the metaphones of two strings are identical.
    382      *
    383      * @param str1 First of two strings to compare
    384      * @param str2 Second of two strings to compare
    385      * @return true if the metaphones of these strings are identical,
    386      *         false otherwise.
    387      */
    388     public boolean isMetaphoneEqual(String str1, String str2) {
    389         return metaphone(str1).equals(metaphone(str2));
    390     }
    391 
    392     /**
    393      * Returns the maxCodeLen.
    394      * @return int
    395      */
    396     public int getMaxCodeLen() { return this.maxCodeLen; }
    397 
    398     /**
    399      * Sets the maxCodeLen.
    400      * @param maxCodeLen The maxCodeLen to set
    401      */
    402     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
    403 
    404 }
    405