Home | History | Annotate | Download | only in language
      1 /*
      2  * Copyright 2001-2004 The Apache Software Foundation.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package org.apache.commons.codec.language;
     18 
     19 import org.apache.commons.codec.EncoderException;
     20 import org.apache.commons.codec.StringEncoder;
     21 
     22 /**
     23  * Encodes a string into a Refined Soundex value. A refined soundex code is
     24  * optimized for spell checking words. Soundex method originally developed by
     25  * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
     26  *
     27  * @author Apache Software Foundation
     28  * @version $Id: RefinedSoundex.java,v 1.21 2004/06/05 18:32:04 ggregory Exp $
     29  */
     30 public class RefinedSoundex implements StringEncoder {
     31 
     32     /**
     33      * This static variable contains an instance of the RefinedSoundex using
     34      * the US_ENGLISH mapping.
     35      */
     36     public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
     37 
     38     /**
     39      * RefinedSoundex is *refined* for a number of reasons one being that the
     40      * mappings have been altered. This implementation contains default
     41      * mappings for US English.
     42      */
     43     public static final char[] US_ENGLISH_MAPPING = "01360240043788015936020505".toCharArray();
     44 
     45     /**
     46      * Every letter of the alphabet is "mapped" to a numerical value. This char
     47      * array holds the values to which each letter is mapped. This
     48      * implementation contains a default map for US_ENGLISH
     49      */
     50     private char[] soundexMapping;
     51 
     52     /**
     53      * Creates an instance of the RefinedSoundex object using the default US
     54      * English mapping.
     55      */
     56     public RefinedSoundex() {
     57         this(US_ENGLISH_MAPPING);
     58     }
     59 
     60     /**
     61      * Creates a refined soundex instance using a custom mapping. This
     62      * constructor can be used to customize the mapping, and/or possibly
     63      * provide an internationalized mapping for a non-Western character set.
     64      *
     65      * @param mapping
     66      *                  Mapping array to use when finding the corresponding code for
     67      *                  a given character
     68      */
     69     public RefinedSoundex(char[] mapping) {
     70         this.soundexMapping = mapping;
     71     }
     72 
     73     // BEGIN android-note
     74     // Removed @see reference to SoundexUtils below, since the class isn't
     75     // public.
     76     // END android-note
     77     /**
     78      * Returns the number of characters in the two encoded Strings that are the
     79      * same. This return value ranges from 0 to the length of the shortest
     80      * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
     81      * example) indicates strong similarity or identical values. For refined
     82      * Soundex, the return value can be greater than 4.
     83      *
     84      * @param s1
     85      *                  A String that will be encoded and compared.
     86      * @param s2
     87      *                  A String that will be encoded and compared.
     88      * @return The number of characters in the two encoded Strings that are the
     89      *             same from 0 to to the length of the shortest encoded String.
     90      *
     91      * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
     92      *          MS T-SQL DIFFERENCE</a>
     93      *
     94      * @throws EncoderException
     95      *                  if an error occurs encoding one of the strings
     96      * @since 1.3
     97      */
     98     public int difference(String s1, String s2) throws EncoderException {
     99         return SoundexUtils.difference(this, s1, s2);
    100     }
    101 
    102     /**
    103      * Encodes an Object using the refined soundex algorithm. This method is
    104      * provided in order to satisfy the requirements of the Encoder interface,
    105      * and will throw an EncoderException if the supplied object is not of type
    106      * java.lang.String.
    107      *
    108      * @param pObject
    109      *                  Object to encode
    110      * @return An object (or type java.lang.String) containing the refined
    111      *             soundex code which corresponds to the String supplied.
    112      * @throws EncoderException
    113      *                  if the parameter supplied is not of type java.lang.String
    114      */
    115     public Object encode(Object pObject) throws EncoderException {
    116         if (!(pObject instanceof java.lang.String)) {
    117             throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
    118         }
    119         return soundex((String) pObject);
    120     }
    121 
    122     /**
    123      * Encodes a String using the refined soundex algorithm.
    124      *
    125      * @param pString
    126      *                  A String object to encode
    127      * @return A Soundex code corresponding to the String supplied
    128      */
    129     public String encode(String pString) {
    130         return soundex(pString);
    131     }
    132 
    133     /**
    134      * Returns the mapping code for a given character. The mapping codes are
    135      * maintained in an internal char array named soundexMapping, and the
    136      * default values of these mappings are US English.
    137      *
    138      * @param c
    139      *                  char to get mapping for
    140      * @return A character (really a numeral) to return for the given char
    141      */
    142     char getMappingCode(char c) {
    143         if (!Character.isLetter(c)) {
    144             return 0;
    145         }
    146         return this.soundexMapping[Character.toUpperCase(c) - 'A'];
    147     }
    148 
    149     /**
    150      * Retreives the Refined Soundex code for a given String object.
    151      *
    152      * @param str
    153      *                  String to encode using the Refined Soundex algorithm
    154      * @return A soundex code for the String supplied
    155      */
    156     public String soundex(String str) {
    157         if (str == null) {
    158             return null;
    159         }
    160         str = SoundexUtils.clean(str);
    161         if (str.length() == 0) {
    162             return str;
    163         }
    164 
    165         StringBuffer sBuf = new StringBuffer();
    166         sBuf.append(str.charAt(0));
    167 
    168         char last, current;
    169         last = '*';
    170 
    171         for (int i = 0; i < str.length(); i++) {
    172 
    173             current = getMappingCode(str.charAt(i));
    174             if (current == last) {
    175                 continue;
    176             } else if (current != 0) {
    177                 sBuf.append(current);
    178             }
    179 
    180             last = current;
    181 
    182         }
    183 
    184         return sBuf.toString();
    185     }
    186 }
    187