/*
 * Copyright 2001-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.language;

import java.util.Locale;

import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

/**
 * Encodes a string into a metaphone value.
 * <p>
 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
 * </p>
 * <p>
 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
 * 39.</CITE>
 * </p>
 *
 * @author Apache Software Foundation
 * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
 *
 * @deprecated Please use {@link java.net.URL#openConnection} instead.
 *     Please visit <a href="http://android-developers.blogspot.com/2011/09/androids-http-clients.html">this webpage</a>
 *     for further details.
 */
// NOTE(review): the replacement named in the deprecation notice above is a networking API and
// does not look related to phonetic encoding; the notice text is kept as found - confirm intent.
@Deprecated
public class Metaphone implements StringEncoder {

    /**
     * The five vowels of the English language.
     */
    private static final String VOWELS = "AEIOU";

    /**
     * Letters that, when they follow C or G (or the G of DG), select the soft sound.
     */
    private static final String FRONTV = "EIY";

    /**
     * Letters after which an H is not coded.
     */
    private static final String VARSON = "CSPTG";

    /**
     * The maximum length of a generated code; defaults to 4.
     */
    private int maxCodeLen = 4;

    /**
     * Creates an instance of the Metaphone encoder.
     */
    public Metaphone() {
        super();
    }

    /**
     * Find the metaphone value of a String. This is similar to the
     * soundex algorithm, but better at finding similar sounding words.
     * All input is converted to upper case using {@link Locale#ENGLISH}, so the
     * result does not depend on the default locale of the running JVM.
     * Limitations: Input format is expected to be a single ASCII word
     * with only characters in the A - Z range, no punctuation or numbers.
     * Characters outside A - Z are not coded.
     *
     * @param txt String to find the metaphone code for
     * @return A metaphone code corresponding to the String supplied; the empty
     *         string if {@code txt} is null or empty, and the upper-cased input
     *         itself if {@code txt} is a single character
     */
    public String metaphone(String txt) {
        if ((txt == null) || (txt.length() == 0)) {
            return "";
        }
        // single character is itself
        if (txt.length() == 1) {
            return txt.toUpperCase(Locale.ENGLISH);
        }

        // Locale.ENGLISH: the default-locale overload maps 'i' to U+0130 under
        // Turkish/Azeri locales, which none of the letter cases below recognise.
        final char[] inwd = txt.toUpperCase(Locale.ENGLISH).toCharArray();

        final StringBuilder local = fixInitialLetters(inwd); // working string
        final StringBuilder code = new StringBuilder(10);    // output

        final int wdsz = local.length();
        int n = 0;

        while ((code.length() < this.getMaxCodeLen()) && (n < wdsz)) {
            final char symb = local.charAt(n);
            // remove duplicate letters except C
            if ((symb != 'C') && isPreviousChar(local, n, symb)) {
                n++;
            } else { // not dup
                switch (symb) {
                case 'A':
                case 'E':
                case 'I':
                case 'O':
                case 'U':
                    if (n == 0) {
                        code.append(symb);
                    }
                    break; // only use vowel if leading char
                case 'B':
                    if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) {
                        break; // B is silent if word ends in MB
                    }
                    code.append(symb);
                    break;
                case 'C': // lots of C special cases
                    /* discard if SCI, SCE or SCY */
                    if (isPreviousChar(local, n, 'S')
                            && !isLastChar(wdsz, n)
                            && (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) {
                        break;
                    }
                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
                        code.append('X');
                        break;
                    }
                    if (!isLastChar(wdsz, n)
                            && (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) {
                        code.append('S');
                        break; // CI,CE,CY -> S
                    }
                    if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH -> sk
                        code.append('K');
                        break;
                    }
                    if (isNextChar(local, n, 'H')) { // detect CH
                        // Word-initial CH followed by a vowel is coded K; every other CH is coded X.
                        if ((n == 0) && (wdsz >= 3) && isVowel(local, 2)) {
                            code.append('K');
                        } else {
                            code.append('X');
                        }
                    } else {
                        code.append('K');
                    }
                    break;
                case 'D':
                    if (!isLastChar(wdsz, n + 1)
                            && isNextChar(local, n, 'G')
                            && (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
                        code.append('J');
                        n += 2; // skip the G and the front vowel as well
                    } else {
                        code.append('T');
                    }
                    break;
                case 'G': // GH silent at end or before consonant
                    if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
                        break;
                    }
                    if (!isLastChar(wdsz, n + 1)
                            && isNextChar(local, n, 'H')
                            && !isVowel(local, n + 2)) {
                        break;
                    }
                    // Non-initial GN (which also covers GNED) has a silent G.
                    if ((n > 0) && regionMatch(local, n, "GN")) {
                        break; // silent G
                    }
                    // A G directly after another G never reaches this switch, because doubled
                    // letters are skipped above; 'hard' is therefore always false here. The
                    // check is retained so that the generated codes stay exactly as before.
                    final boolean hard = isPreviousChar(local, n, 'G');
                    if (!isLastChar(wdsz, n)
                            && (FRONTV.indexOf(local.charAt(n + 1)) >= 0)
                            && !hard) {
                        code.append('J');
                    } else {
                        code.append('K');
                    }
                    break;
                case 'H':
                    if (isLastChar(wdsz, n)) {
                        break; // terminal H
                    }
                    if ((n > 0) && (VARSON.indexOf(local.charAt(n - 1)) >= 0)) {
                        break; // H after C, S, P, T or G
                    }
                    if (isVowel(local, n + 1)) {
                        code.append('H'); // Hvowel
                    }
                    break;
                case 'F':
                case 'J':
                case 'L':
                case 'M':
                case 'N':
                case 'R':
                    code.append(symb);
                    break;
                case 'K':
                    // initial K is always coded; otherwise K is dropped after C
                    if ((n == 0) || !isPreviousChar(local, n, 'C')) {
                        code.append(symb);
                    }
                    break;
                case 'P':
                    if (isNextChar(local, n, 'H')) {
                        code.append('F'); // PH -> F
                    } else {
                        code.append(symb);
                    }
                    break;
                case 'Q':
                    code.append('K');
                    break;
                case 'S':
                    if (regionMatch(local, n, "SH")
                            || regionMatch(local, n, "SIO")
                            || regionMatch(local, n, "SIA")) {
                        code.append('X');
                    } else {
                        code.append('S');
                    }
                    break;
                case 'T':
                    if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
                        code.append('X');
                        break;
                    }
                    if (regionMatch(local, n, "TCH")) {
                        break; // silent if in "TCH"
                    }
                    // substitute numeral 0 for TH (resembles theta after all)
                    if (regionMatch(local, n, "TH")) {
                        code.append('0');
                    } else {
                        code.append('T');
                    }
                    break;
                case 'V':
                    code.append('F');
                    break;
                case 'W':
                case 'Y': // silent if not followed by vowel
                    if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
                        code.append(symb);
                    }
                    break;
                case 'X':
                    code.append('K');
                    code.append('S');
                    break;
                case 'Z':
                    code.append('S');
                    break;
                } // end switch; any other character is not coded
                n++;
            } // end else from symb != 'C'
            // X appends two characters, so the code can overshoot by one
            if (code.length() > this.getMaxCodeLen()) {
                code.setLength(this.getMaxCodeLen());
            }
        }
        return code.toString();
    }

    /**
     * Builds the working string from the upper-cased input, applying the
     * exceptions for the first two characters: KN, GN, PN, AE and WR drop
     * their first letter, WH becomes W, and an initial X becomes S.
     *
     * @param inwd upper-cased input of at least two characters; its first
     *             element is overwritten when it is 'X'
     * @return the working string with its initial letters fixed
     */
    private StringBuilder fixInitialLetters(char[] inwd) {
        final StringBuilder local = new StringBuilder(40);
        switch (inwd[0]) {
        case 'K':
        case 'G':
        case 'P': /* looking for KN, etc */
            if (inwd[1] == 'N') {
                local.append(inwd, 1, inwd.length - 1);
            } else {
                local.append(inwd);
            }
            break;
        case 'A': /* looking for AE */
            if (inwd[1] == 'E') {
                local.append(inwd, 1, inwd.length - 1);
            } else {
                local.append(inwd);
            }
            break;
        case 'W': /* looking for WR or WH */
            if (inwd[1] == 'R') { // WR -> R
                local.append(inwd, 1, inwd.length - 1);
            } else if (inwd[1] == 'H') { // WH -> W
                local.append(inwd, 1, inwd.length - 1);
                local.setCharAt(0, 'W');
            } else {
                local.append(inwd);
            }
            break;
        case 'X': /* initial X becomes S */
            inwd[0] = 'S';
            local.append(inwd);
            break;
        default:
            local.append(inwd);
        }
        return local;
    }

    /**
     * Tells whether the character at {@code index} is one of A, E, I, O, U.
     */
    private boolean isVowel(StringBuilder string, int index) {
        return VOWELS.indexOf(string.charAt(index)) >= 0;
    }

    /**
     * Tells whether the character just before {@code index} equals {@code c};
     * false when {@code index} is 0 or not inside the string.
     */
    private boolean isPreviousChar(StringBuilder string, int index, char c) {
        return (index > 0)
                && (index < string.length())
                && (string.charAt(index - 1) == c);
    }

    /**
     * Tells whether the character just after {@code index} equals {@code c};
     * false when there is no such character.
     */
    private boolean isNextChar(StringBuilder string, int index, char c) {
        return (index >= 0)
                && (index < string.length() - 1)
                && (string.charAt(index + 1) == c);
    }

    /**
     * Tells whether {@code test} occurs in {@code string} starting at
     * {@code index}; false when it would run past the end of the string.
     */
    private boolean regionMatch(StringBuilder string, int index, String test) {
        if ((index < 0) || ((index + test.length() - 1) >= string.length())) {
            return false;
        }
        return string.substring(index, index + test.length()).equals(test);
    }

    /**
     * Tells whether position {@code n} is the last position of a word of
     * length {@code wdsz}.
     */
    private boolean isLastChar(int wdsz, int n) {
        return n + 1 == wdsz;
    }

    /**
     * Encodes an Object using the metaphone algorithm. This method
     * is provided in order to satisfy the requirements of the
     * Encoder interface, and will throw an EncoderException if the
     * supplied object is not of type java.lang.String.
     *
     * @param pObject Object to encode
     * @return An object (of type java.lang.String) containing the
     *         metaphone code which corresponds to the String supplied.
     * @throws EncoderException if the parameter supplied is not
     *                          of type java.lang.String
     */
    public Object encode(Object pObject) throws EncoderException {
        if (!(pObject instanceof java.lang.String)) {
            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
        }
        return metaphone((String) pObject);
    }

    /**
     * Encodes a String using the Metaphone algorithm.
     *
     * @param pString String object to encode
     * @return The metaphone code corresponding to the String supplied
     */
    public String encode(String pString) {
        return metaphone(pString);
    }

    /**
     * Tests if the metaphones of two strings are identical.
     *
     * @param str1 First of two strings to compare
     * @param str2 Second of two strings to compare
     * @return true if the metaphones of these strings are identical,
     *         false otherwise.
     */
    public boolean isMetaphoneEqual(String str1, String str2) {
        return metaphone(str1).equals(metaphone(str2));
    }

    /**
     * Returns the maxCodeLen.
     *
     * @return the maximum length of a generated code
     */
    public int getMaxCodeLen() {
        return this.maxCodeLen;
    }

    /**
     * Sets the maxCodeLen.
     *
     * @param maxCodeLen The maxCodeLen to set
     */
    public void setMaxCodeLen(int maxCodeLen) {
        this.maxCodeLen = maxCodeLen;
    }

}