1 /* 2 * Copyright 2001-2004 The Apache Software Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.commons.codec.language; 18 19 import org.apache.commons.codec.EncoderException; 20 import org.apache.commons.codec.StringEncoder; 21 22 /** 23 * Encodes a string into a metaphone value. 24 * <p> 25 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 26 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 27 * </p> 28 * <p> 29 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p 30 * 39.</CITE> 31 * </p> 32 * 33 * @author Apache Software Foundation 34 * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $ 35 */ 36 public class Metaphone implements StringEncoder { 37 38 /** 39 * Five values in the English language 40 */ 41 private String vowels = "AEIOU" ; 42 43 /** 44 * Variable used in Metaphone algorithm 45 */ 46 private String frontv = "EIY" ; 47 48 /** 49 * Variable used in Metaphone algorithm 50 */ 51 private String varson = "CSPTG" ; 52 53 /** 54 * The max code length for metaphone is 4 55 */ 56 private int maxCodeLen = 4 ; 57 58 /** 59 * Creates an instance of the Metaphone encoder 60 */ 61 public Metaphone() { 62 super(); 63 } 64 65 /** 66 * Find the metaphone value of a String. This is similar to the 67 * soundex algorithm, but better at finding similar sounding words. 68 * All input is converted to upper case. 69 * Limitations: Input format is expected to be a single ASCII word 70 * with only characters in the A - Z range, no punctuation or numbers. 71 * 72 * @param txt String to find the metaphone code for 73 * @return A metaphone code corresponding to the String supplied 74 */ 75 public String metaphone(String txt) { 76 boolean hard = false ; 77 if ((txt == null) || (txt.length() == 0)) { 78 return "" ; 79 } 80 // single character is itself 81 if (txt.length() == 1) { 82 return txt.toUpperCase() ; 83 } 84 85 char[] inwd = txt.toUpperCase().toCharArray() ; 86 87 StringBuffer local = new StringBuffer(40); // manipulate 88 StringBuffer code = new StringBuffer(10) ; // output 89 // handle initial 2 characters exceptions 90 switch(inwd[0]) { 91 case 'K' : 92 case 'G' : 93 case 'P' : /* looking for KN, etc*/ 94 if (inwd[1] == 'N') { 95 local.append(inwd, 1, inwd.length - 1); 96 } else { 97 local.append(inwd); 98 } 99 break; 100 case 'A': /* looking for AE */ 101 if (inwd[1] == 'E') { 102 local.append(inwd, 1, inwd.length - 1); 103 } else { 104 local.append(inwd); 105 } 106 break; 107 case 'W' : /* looking for WR or WH */ 108 if (inwd[1] == 'R') { // WR -> R 109 local.append(inwd, 1, inwd.length - 1); 110 break ; 111 } 112 if (inwd[1] == 'H') { 113 local.append(inwd, 1, inwd.length - 1); 114 local.setCharAt(0, 'W'); // WH -> W 115 } else { 116 local.append(inwd); 117 } 118 break; 119 case 'X' : /* initial X becomes S */ 120 inwd[0] = 'S'; 121 local.append(inwd); 122 break ; 123 default : 124 local.append(inwd); 125 } // now local has working string with initials fixed 126 127 int wdsz = local.length(); 128 int n = 0 ; 129 130 while ((code.length() < this.getMaxCodeLen()) && 131 (n < wdsz) ) { // max code size of 4 works well 132 char symb = local.charAt(n) ; 133 // remove duplicate letters except C 134 if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) { 135 n++ ; 136 } else { // not dup 137 switch(symb) { 138 case 'A' : case 'E' : case 'I' : case 'O' : case 'U' : 139 if (n == 0) { 140 code.append(symb); 141 } 142 break ; // only use vowel if leading char 143 case 'B' : 144 if ( isPreviousChar(local, n, 'M') && 145 isLastChar(wdsz, n) ) { // B is silent if word ends in MB 146 break; 147 } 148 code.append(symb); 149 break; 150 case 'C' : // lots of C special cases 151 /* discard if SCI, SCE or SCY */ 152 if ( isPreviousChar(local, n, 'S') && 153 !isLastChar(wdsz, n) && 154 (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) { 155 break; 156 } 157 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 158 code.append('X'); 159 break; 160 } 161 if (!isLastChar(wdsz, n) && 162 (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) { 163 code.append('S'); 164 break; // CI,CE,CY -> S 165 } 166 if (isPreviousChar(local, n, 'S') && 167 isNextChar(local, n, 'H') ) { // SCH->sk 168 code.append('K') ; 169 break ; 170 } 171 if (isNextChar(local, n, 'H')) { // detect CH 172 if ((n == 0) && 173 (wdsz >= 3) && 174 isVowel(local,2) ) { // CH consonant -> K consonant 175 code.append('K'); 176 } else { 177 code.append('X'); // CHvowel -> X 178 } 179 } else { 180 code.append('K'); 181 } 182 break ; 183 case 'D' : 184 if (!isLastChar(wdsz, n + 1) && 185 isNextChar(local, n, 'G') && 186 (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 187 code.append('J'); n += 2 ; 188 } else { 189 code.append('T'); 190 } 191 break ; 192 case 'G' : // GH silent at end or before consonant 193 if (isLastChar(wdsz, n + 1) && 194 isNextChar(local, n, 'H')) { 195 break; 196 } 197 if (!isLastChar(wdsz, n + 1) && 198 isNextChar(local,n,'H') && 199 !isVowel(local,n+2)) { 200 break; 201 } 202 if ((n > 0) && 203 ( regionMatch(local, n, "GN") || 204 regionMatch(local, n, "GNED") ) ) { 205 break; // silent G 206 } 207 if (isPreviousChar(local, n, 'G')) { 208 hard = true ; 209 } else { 210 hard = false ; 211 } 212 if (!isLastChar(wdsz, n) && 213 (this.frontv.indexOf(local.charAt(n + 1)) >= 0) && 214 (!hard)) { 215 code.append('J'); 216 } else { 217 code.append('K'); 218 } 219 break ; 220 case 'H': 221 if (isLastChar(wdsz, n)) { 222 break ; // terminal H 223 } 224 if ((n > 0) && 225 (this.varson.indexOf(local.charAt(n - 1)) >= 0)) { 226 break; 227 } 228 if (isVowel(local,n+1)) { 229 code.append('H'); // Hvowel 230 } 231 break; 232 case 'F': 233 case 'J' : 234 case 'L' : 235 case 'M': 236 case 'N' : 237 case 'R' : 238 code.append(symb); 239 break; 240 case 'K' : 241 if (n > 0) { // not initial 242 if (!isPreviousChar(local, n, 'C')) { 243 code.append(symb); 244 } 245 } else { 246 code.append(symb); // initial K 247 } 248 break ; 249 case 'P' : 250 if (isNextChar(local,n,'H')) { 251 // PH -> F 252 code.append('F'); 253 } else { 254 code.append(symb); 255 } 256 break ; 257 case 'Q' : 258 code.append('K'); 259 break; 260 case 'S' : 261 if (regionMatch(local,n,"SH") || 262 regionMatch(local,n,"SIO") || 263 regionMatch(local,n,"SIA")) { 264 code.append('X'); 265 } else { 266 code.append('S'); 267 } 268 break; 269 case 'T' : 270 if (regionMatch(local,n,"TIA") || 271 regionMatch(local,n,"TIO")) { 272 code.append('X'); 273 break; 274 } 275 if (regionMatch(local,n,"TCH")) { 276 // Silent if in "TCH" 277 break; 278 } 279 // substitute numeral 0 for TH (resembles theta after all) 280 if (regionMatch(local,n,"TH")) { 281 code.append('0'); 282 } else { 283 code.append('T'); 284 } 285 break ; 286 case 'V' : 287 code.append('F'); break ; 288 case 'W' : case 'Y' : // silent if not followed by vowel 289 if (!isLastChar(wdsz,n) && 290 isVowel(local,n+1)) { 291 code.append(symb); 292 } 293 break ; 294 case 'X' : 295 code.append('K'); code.append('S'); 296 break ; 297 case 'Z' : 298 code.append('S'); break ; 299 } // end switch 300 n++ ; 301 } // end else from symb != 'C' 302 if (code.length() > this.getMaxCodeLen()) { 303 code.setLength(this.getMaxCodeLen()); 304 } 305 } 306 return code.toString(); 307 } 308 309 private boolean isVowel(StringBuffer string, int index) { 310 return (this.vowels.indexOf(string.charAt(index)) >= 0); 311 } 312 313 private boolean isPreviousChar(StringBuffer string, int index, char c) { 314 boolean matches = false; 315 if( index > 0 && 316 index < string.length() ) { 317 matches = string.charAt(index - 1) == c; 318 } 319 return matches; 320 } 321 322 private boolean isNextChar(StringBuffer string, int index, char c) { 323 boolean matches = false; 324 if( index >= 0 && 325 index < string.length() - 1 ) { 326 matches = string.charAt(index + 1) == c; 327 } 328 return matches; 329 } 330 331 private boolean regionMatch(StringBuffer string, int index, String test) { 332 boolean matches = false; 333 if( index >= 0 && 334 (index + test.length() - 1) < string.length() ) { 335 String substring = string.substring( index, index + test.length()); 336 matches = substring.equals( test ); 337 } 338 return matches; 339 } 340 341 private boolean isLastChar(int wdsz, int n) { 342 return n + 1 == wdsz; 343 } 344 345 346 /** 347 * Encodes an Object using the metaphone algorithm. This method 348 * is provided in order to satisfy the requirements of the 349 * Encoder interface, and will throw an EncoderException if the 350 * supplied object is not of type java.lang.String. 351 * 352 * @param pObject Object to encode 353 * @return An object (or type java.lang.String) containing the 354 * metaphone code which corresponds to the String supplied. 355 * @throws EncoderException if the parameter supplied is not 356 * of type java.lang.String 357 */ 358 public Object encode(Object pObject) throws EncoderException { 359 if (!(pObject instanceof java.lang.String)) { 360 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 361 } 362 return metaphone((String) pObject); 363 } 364 365 /** 366 * Encodes a String using the Metaphone algorithm. 367 * 368 * @param pString String object to encode 369 * @return The metaphone code corresponding to the String supplied 370 */ 371 public String encode(String pString) { 372 return metaphone(pString); 373 } 374 375 /** 376 * Tests is the metaphones of two strings are identical. 377 * 378 * @param str1 First of two strings to compare 379 * @param str2 Second of two strings to compare 380 * @return true if the metaphones of these strings are identical, 381 * false otherwise. 382 */ 383 public boolean isMetaphoneEqual(String str1, String str2) { 384 return metaphone(str1).equals(metaphone(str2)); 385 } 386 387 /** 388 * Returns the maxCodeLen. 389 * @return int 390 */ 391 public int getMaxCodeLen() { return this.maxCodeLen; } 392 393 /** 394 * Sets the maxCodeLen. 395 * @param maxCodeLen The maxCodeLen to set 396 */ 397 public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } 398 399 } 400