Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License
     15  */
     16 package com.android.providers.contacts.aggregation.util;
     17 
     18 import java.util.Arrays;
     19 
     20 /**
     21  * A string distance calculator, particularly suited for name matching.
     22 * <p>
     23  * A detailed discussion of the topic of record linkage in general and name matching
     24  * in particular can be found in this article:
     25  * <blockquote>
     26  * Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions".
     27  * Research Report Series, RRS.
     28  * </blockquote>
     29  */
     30 public class NameDistance {
     31 
     32     private static final float WINKLER_BONUS_THRESHOLD = 0.7f;
     33     private static final int MIN_EXACT_PREFIX_LENGTH = 3;
     34 
     35     private final int mMaxLength;
     36     private final boolean mPrefixOnly;
     37     private final boolean[] mMatchFlags1;
     38     private final boolean[] mMatchFlags2;
     39 
     40     /**
     41      * Constructor.
     42      *
     43      * @param maxLength byte arrays are truncated if longer than this number
     44      */
     45     public NameDistance(int maxLength) {
     46         mMaxLength = maxLength;
     47         mPrefixOnly = false;
     48         mMatchFlags1 = new boolean[maxLength];
     49         mMatchFlags2 = new boolean[maxLength];
     50     }
     51 
     52     /**
     53      * Constructor for a matcher that only checks if one string is the exact prefix of the other
     54      */
     55     public NameDistance() {
     56         mPrefixOnly = true;
     57         mMaxLength = 0;
     58         mMatchFlags1 = mMatchFlags2 = null;
     59     }
     60 
     61     /**
     62      * Computes a string distance between two normalized strings passed as byte arrays.
     63      */
     64     public float getDistance(byte bytes1[], byte bytes2[]) {
     65         byte[] array1, array2;
     66 
     67         if (bytes1.length > bytes2.length) {
     68             array2 = bytes1;
     69             array1 = bytes2;
     70         } else {
     71             array2 = bytes2;
     72             array1 = bytes1;
     73         }
     74 
     75         int length1 = array1.length;
     76         if (length1 >= MIN_EXACT_PREFIX_LENGTH) {
     77             boolean prefix = true;
     78             for (int i = 0; i < array1.length; i++) {
     79                 if (array1[i] != array2[i]) {
     80                     prefix = false;
     81                     break;
     82                 }
     83             }
     84             if (prefix) {
     85                 return 1.0f;
     86             }
     87         }
     88 
     89         if (mPrefixOnly) {
     90             return 0;
     91         }
     92 
     93         if (length1 > mMaxLength) {
     94             length1 = mMaxLength;
     95         }
     96 
     97         int length2 = array2.length;
     98         if (length2 > mMaxLength) {
     99             length2 = mMaxLength;
    100         }
    101 
    102         Arrays.fill(mMatchFlags1, 0, length1, false);
    103         Arrays.fill(mMatchFlags2, 0, length2, false);
    104 
    105         int range = length2 / 2 - 1;
    106         if (range < 0) {
    107             range = 0;
    108         }
    109 
    110         int matches = 0;
    111         for (int i = 0; i < length1; i++) {
    112             byte c1 = array1[i];
    113 
    114             int from = i - range;
    115             if (from < 0) {
    116                 from = 0;
    117             }
    118 
    119             int to = i + range + 1;
    120             if (to > length2) {
    121                 to = length2;
    122             }
    123 
    124             for (int j = from; j < to; j++) {
    125                 if (!mMatchFlags2[j] && c1 == array2[j]) {
    126                     mMatchFlags1[i] = mMatchFlags2[j] = true;
    127                     matches++;
    128                     break;
    129                 }
    130             }
    131         }
    132 
    133         if (matches == 0) {
    134             return 0f;
    135         }
    136 
    137         int transpositions = 0;
    138         int j = 0;
    139         for (int i = 0; i < length1; i++) {
    140             if (mMatchFlags1[i]) {
    141                 while (!mMatchFlags2[j]) {
    142                     j++;
    143                 }
    144                 if (array1[i] != array2[j]) {
    145                     transpositions++;
    146                 }
    147                 j++;
    148             }
    149         }
    150 
    151         float m = matches;
    152         float jaro = ((m / length1 + m / length2 + (m - (transpositions / 2f)) / m)) / 3;
    153 
    154         if (jaro < WINKLER_BONUS_THRESHOLD) {
    155             return jaro;
    156         }
    157 
    158         // Add Winkler bonus
    159         int prefix = 0;
    160         for (int i = 0; i < length1; i++) {
    161             if (bytes1[i] != bytes2[i]) {
    162                 break;
    163             }
    164             prefix++;
    165         }
    166 
    167         return jaro + Math.min(0.1f, 1f / length2) * prefix * (1 - jaro);
    168     }
    169 }
    170