Home | History | Annotate | Download | only in contacts
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License
     15  */
     16 package com.android.providers.contacts;
     17 
     18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
     19 
     20 import java.util.ArrayList;
     21 import java.util.Collections;
     22 import java.util.HashMap;
     23 import java.util.List;
     24 
     25 /**
     26  * Logic for matching contacts' data and accumulating match scores.
     27  */
     28 public class ContactMatcher {
     29 
     30     // Best possible match score
     31     public static final int MAX_SCORE = 100;
     32 
     33     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
     34     public static final int SCORE_THRESHOLD_SUGGEST = 50;
     35 
     36     // Automatically aggregate contacts if their match score is equal or greater than this threshold
     37     public static final int SCORE_THRESHOLD_PRIMARY = 70;
     38 
     39     // Automatically aggregate contacts if the match score is equal or greater than this threshold
     40     // and there is a secondary match (phone number, email etc).
     41     public static final int SCORE_THRESHOLD_SECONDARY = 50;
     42 
     43     // Score for missing data (as opposed to present data but a bad match)
     44     private static final int NO_DATA_SCORE = -1;
     45 
     46     // Score for matching phone numbers
     47     private static final int PHONE_MATCH_SCORE = 71;
     48 
     49     // Score for matching email addresses
     50     private static final int EMAIL_MATCH_SCORE = 71;
     51 
     52     // Score for matching nickname
     53     private static final int NICKNAME_MATCH_SCORE = 71;
     54 
     55     // Maximum number of characters in a name to be considered by the matching algorithm.
     56     private static final int MAX_MATCHED_NAME_LENGTH = 30;
     57 
     58     // Scores a multiplied by this number to allow room for "fractional" scores
     59     private static final int SCORE_SCALE = 1000;
     60 
     61     public static final int MATCHING_ALGORITHM_EXACT = 0;
     62     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
     63     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
     64 
     65     // Minimum edit distance between two names to be considered an approximate match
     66     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
     67 
     68     // Minimum edit distance between two email ids to be considered an approximate match
     69     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
     70 
     71     /**
     72      * Name matching scores: a matrix by name type vs. candidate lookup type.
     73      * For example, if the name type is "full name" while we are looking for a
     74      * "full name", the score may be 99. If we are looking for a "nickname" but
     75      * find "first name", the score may be 50 (see specific scores defined
     76      * below.)
     77      * <p>
     78      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
     79      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
     80      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
     81      * between the strings is below the threshold.
     82      * <p>
     83      * We use a string matching algorithm, which is particularly suited for
     84      * name matching. See {@link NameDistance}.
     85      */
     86     private static int[] sMinScore =
     87             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     88     private static int[] sMaxScore =
     89             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     90 
     91     /*
     92      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
     93      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
     94      * not!  They are useful in three-way aggregation cases when we have, for example, both
     95      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
     96      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
     97      * scores than direct matches.
     98      */
     99     static {
    100         setScoreRange(NameLookupType.NAME_EXACT,
    101                 NameLookupType.NAME_EXACT, 99, 99);
    102         setScoreRange(NameLookupType.NAME_VARIANT,
    103                 NameLookupType.NAME_VARIANT, 90, 90);
    104         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    105                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
    106 
    107         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    108                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
    109         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    110                 NameLookupType.NICKNAME, 50, 60);
    111 
    112         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    113                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    114         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    115                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    116         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    117                 NameLookupType.NICKNAME, 50, 60);
    118 
    119         setScoreRange(NameLookupType.NICKNAME,
    120                 NameLookupType.NICKNAME, 50, 60);
    121         setScoreRange(NameLookupType.NICKNAME,
    122                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    123         setScoreRange(NameLookupType.NICKNAME,
    124                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    125     }
    126 
    127     /**
    128      * Populates the cells of the score matrix and score span matrix
    129      * corresponding to the {@code candidateNameType} and {@code nameType}.
    130      */
    131     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
    132         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    133         sMinScore[index] = scoreFrom;
    134         sMaxScore[index] = scoreTo;
    135     }
    136 
    137     /**
    138      * Returns the lower range for the match score for the given {@code candidateNameType} and
    139      * {@code nameType}.
    140      */
    141     private static int getMinScore(int candidateNameType, int nameType) {
    142         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    143         return sMinScore[index];
    144     }
    145 
    146     /**
    147      * Returns the upper range for the match score for the given {@code candidateNameType} and
    148      * {@code nameType}.
    149      */
    150     private static int getMaxScore(int candidateNameType, int nameType) {
    151         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    152         return sMaxScore[index];
    153     }
    154 
    155     /**
    156      * Captures the max score and match count for a specific contact.  Used in an
    157      * contactId - MatchScore map.
    158      */
    159     public static class MatchScore implements Comparable<MatchScore> {
    160         private long mContactId;
    161         private boolean mKeepIn;
    162         private boolean mKeepOut;
    163         private int mPrimaryScore;
    164         private int mSecondaryScore;
    165         private int mMatchCount;
    166 
    167         public MatchScore(long contactId) {
    168             this.mContactId = contactId;
    169         }
    170 
    171         public void reset(long contactId) {
    172             this.mContactId = contactId;
    173             mKeepIn = false;
    174             mKeepOut = false;
    175             mPrimaryScore = 0;
    176             mSecondaryScore = 0;
    177             mMatchCount = 0;
    178         }
    179 
    180         public long getContactId() {
    181             return mContactId;
    182         }
    183 
    184         public void updatePrimaryScore(int score) {
    185             if (score > mPrimaryScore) {
    186                 mPrimaryScore = score;
    187             }
    188             mMatchCount++;
    189         }
    190 
    191         public void updateSecondaryScore(int score) {
    192             if (score > mSecondaryScore) {
    193                 mSecondaryScore = score;
    194             }
    195             mMatchCount++;
    196         }
    197 
    198         public void keepIn() {
    199             mKeepIn = true;
    200         }
    201 
    202         public void keepOut() {
    203             mKeepOut = true;
    204         }
    205 
    206         public int getScore() {
    207             if (mKeepOut) {
    208                 return 0;
    209             }
    210 
    211             if (mKeepIn) {
    212                 return MAX_SCORE;
    213             }
    214 
    215             int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore);
    216 
    217             // Ensure that of two contacts with the same match score the one with more matching
    218             // data elements wins.
    219             return score * SCORE_SCALE + mMatchCount;
    220         }
    221 
    222         /**
    223          * Descending order of match score.
    224          */
    225         public int compareTo(MatchScore another) {
    226             return another.getScore() - getScore();
    227         }
    228 
    229         @Override
    230         public String toString() {
    231             return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount
    232                     + ")";
    233         }
    234     }
    235 
    236     private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
    237     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
    238     private int mScoreCount = 0;
    239 
    240     private final NameDistance mNameDistanceConservative = new NameDistance();
    241     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
    242 
    243     private MatchScore getMatchingScore(long contactId) {
    244         MatchScore matchingScore = mScores.get(contactId);
    245         if (matchingScore == null) {
    246             if (mScoreList.size() > mScoreCount) {
    247                 matchingScore = mScoreList.get(mScoreCount);
    248                 matchingScore.reset(contactId);
    249             } else {
    250                 matchingScore = new MatchScore(contactId);
    251                 mScoreList.add(matchingScore);
    252             }
    253             mScoreCount++;
    254             mScores.put(contactId, matchingScore);
    255         }
    256         return matchingScore;
    257     }
    258 
    259     /**
    260      * Checks if there is a match and updates the overall score for the
    261      * specified contact for a discovered match. The new score is determined
    262      * by the prior score, by the type of name we were looking for, the type
    263      * of name we found and, if the match is approximate, the distance between the candidate and
    264      * actual name.
    265      */
    266     public void matchName(long contactId, int candidateNameType, String candidateName,
    267             int nameType, String name, int algorithm) {
    268         int maxScore = getMaxScore(candidateNameType, nameType);
    269         if (maxScore == 0) {
    270             return;
    271         }
    272 
    273         if (candidateName.equals(name)) {
    274             updatePrimaryScore(contactId, maxScore);
    275             return;
    276         }
    277 
    278         if (algorithm == MATCHING_ALGORITHM_EXACT) {
    279             return;
    280         }
    281 
    282         int minScore = getMinScore(candidateNameType, nameType);
    283         if (minScore == maxScore) {
    284             return;
    285         }
    286 
    287         byte[] decodedCandidateName = Hex.decodeHex(candidateName);
    288         byte[] decodedName = Hex.decodeHex(name);
    289 
    290         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
    291                 mNameDistanceConservative : mNameDistanceApproximate;
    292 
    293         int score;
    294         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
    295         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
    296                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
    297         float threshold = emailBased
    298                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
    299                 : APPROXIMATE_MATCH_THRESHOLD;
    300         if (distance > threshold) {
    301             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
    302         } else {
    303             score = 0;
    304         }
    305 
    306         updatePrimaryScore(contactId, score);
    307     }
    308 
    309     public void updateScoreWithPhoneNumberMatch(long contactId) {
    310         updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
    311     }
    312 
    313     public void updateScoreWithEmailMatch(long contactId) {
    314         updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
    315     }
    316 
    317     public void updateScoreWithNicknameMatch(long contactId) {
    318         updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
    319     }
    320 
    321     private void updatePrimaryScore(long contactId, int score) {
    322         getMatchingScore(contactId).updatePrimaryScore(score);
    323     }
    324 
    325     private void updateSecondaryScore(long contactId, int score) {
    326         getMatchingScore(contactId).updateSecondaryScore(score);
    327     }
    328 
    329     public void keepIn(long contactId) {
    330         getMatchingScore(contactId).keepIn();
    331     }
    332 
    333     public void keepOut(long contactId) {
    334         getMatchingScore(contactId).keepOut();
    335     }
    336 
    337     public void clear() {
    338         mScores.clear();
    339         mScoreCount = 0;
    340     }
    341 
    342     /**
    343      * Returns a list of IDs for contacts that are matched on secondary data elements
    344      * (phone number, email address, nickname). We still need to obtain the approximate
    345      * primary score for those contacts to determine if any of them should be aggregated.
    346      * <p>
    347      * May return null.
    348      */
    349     public List<Long> prepareSecondaryMatchCandidates(int threshold) {
    350         ArrayList<Long> contactIds = null;
    351 
    352         for (int i = 0; i < mScoreCount; i++) {
    353             MatchScore score = mScoreList.get(i);
    354             if (score.mKeepOut) {
    355                 continue;
    356             }
    357 
    358             int s = score.mSecondaryScore;
    359             if (s >= threshold) {
    360                 if (contactIds == null) {
    361                     contactIds = new ArrayList<Long>();
    362                 }
    363                 contactIds.add(score.mContactId);
    364             }
    365             score.mPrimaryScore = NO_DATA_SCORE;
    366         }
    367         return contactIds;
    368     }
    369 
    370     /**
    371      * Returns the contactId with the best match score over the specified threshold or -1
    372      * if no such contact is found.
    373      */
    374     public long pickBestMatch(int threshold) {
    375         long contactId = -1;
    376         int maxScore = 0;
    377         for (int i = 0; i < mScoreCount; i++) {
    378             MatchScore score = mScoreList.get(i);
    379             if (score.mKeepOut) {
    380                 continue;
    381             }
    382 
    383             if (score.mKeepIn) {
    384                 return score.mContactId;
    385             }
    386 
    387             int s = score.mPrimaryScore;
    388             if (s == NO_DATA_SCORE) {
    389                 s = score.mSecondaryScore;
    390             }
    391 
    392             if (s >= threshold && s > maxScore) {
    393                 contactId = score.mContactId;
    394                 maxScore = s;
    395             }
    396         }
    397         return contactId;
    398     }
    399 
    400     /**
    401      * Returns matches in the order of descending score.
    402      */
    403     public List<MatchScore> pickBestMatches(int threshold) {
    404         int scaledThreshold = threshold * SCORE_SCALE;
    405         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
    406         Collections.sort(matches);
    407         int count = 0;
    408         for (int i = 0; i < mScoreCount; i++) {
    409             MatchScore matchScore = matches.get(i);
    410             if (matchScore.getScore() >= scaledThreshold) {
    411                 count++;
    412             } else {
    413                 break;
    414             }
    415         }
    416 
    417         return matches.subList(0, count);
    418     }
    419 
    420     @Override
    421     public String toString() {
    422         return mScoreList.toString();
    423     }
    424 }
    425