Home | History | Annotate | Download | only in contacts
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License
     15  */
     16 package com.android.providers.contacts;
     17 
     18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
     19 
     20 import java.util.ArrayList;
     21 import java.util.Collections;
     22 import java.util.HashMap;
     23 import java.util.List;
     24 
     25 /**
     26  * Logic for matching contacts' data and accumulating match scores.
     27  */
     28 public class ContactMatcher {
     29 
     30     // Best possible match score
     31     public static final int MAX_SCORE = 100;
     32 
     33     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
     34     public static final int SCORE_THRESHOLD_SUGGEST = 50;
     35 
     36     // Automatically aggregate contacts if their match score is equal or greater than this threshold
     37     public static final int SCORE_THRESHOLD_PRIMARY = 70;
     38 
     39     // Automatically aggregate contacts if the match score is equal or greater than this threshold
     40     // and there is a secondary match (phone number, email etc).
     41     public static final int SCORE_THRESHOLD_SECONDARY = 50;
     42 
     43     // Score for missing data (as opposed to present data but a bad match)
     44     private static final int NO_DATA_SCORE = -1;
     45 
     46     // Score for matching phone numbers
     47     private static final int PHONE_MATCH_SCORE = 71;
     48 
     49     // Score for matching email addresses
     50     private static final int EMAIL_MATCH_SCORE = 71;
     51 
     52     // Score for matching nickname
     53     private static final int NICKNAME_MATCH_SCORE = 71;
     54 
     55     // Maximum number of characters in a name to be considered by the matching algorithm.
     56     private static final int MAX_MATCHED_NAME_LENGTH = 30;
     57 
     58     // Scores a multiplied by this number to allow room for "fractional" scores
     59     private static final int SCORE_SCALE = 1000;
     60 
     61     public static final int MATCHING_ALGORITHM_EXACT = 0;
     62     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
     63     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
     64 
     65     // Minimum edit distance between two names to be considered an approximate match
     66     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
     67 
     68     // Minimum edit distance between two email ids to be considered an approximate match
     69     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
     70 
     71     // Returned value when we found multiple matches and that was not allowed
     72     public static final long MULTIPLE_MATCHES = -2;
     73 
     74     /**
     75      * Name matching scores: a matrix by name type vs. candidate lookup type.
     76      * For example, if the name type is "full name" while we are looking for a
     77      * "full name", the score may be 99. If we are looking for a "nickname" but
     78      * find "first name", the score may be 50 (see specific scores defined
     79      * below.)
     80      * <p>
     81      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
     82      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
     83      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
     84      * between the strings is below the threshold.
     85      * <p>
     86      * We use a string matching algorithm, which is particularly suited for
     87      * name matching. See {@link NameDistance}.
     88      */
     89     private static int[] sMinScore =
     90             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     91     private static int[] sMaxScore =
     92             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     93 
     94     /*
     95      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
     96      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
     97      * not!  They are useful in three-way aggregation cases when we have, for example, both
     98      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
     99      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
    100      * scores than direct matches.
    101      */
    102     static {
    103         setScoreRange(NameLookupType.NAME_EXACT,
    104                 NameLookupType.NAME_EXACT, 99, 99);
    105         setScoreRange(NameLookupType.NAME_VARIANT,
    106                 NameLookupType.NAME_VARIANT, 90, 90);
    107         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    108                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
    109 
    110         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    111                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
    112         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    113                 NameLookupType.NICKNAME, 50, 60);
    114 
    115         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    116                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    117         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    118                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    119         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    120                 NameLookupType.NICKNAME, 50, 60);
    121 
    122         setScoreRange(NameLookupType.NICKNAME,
    123                 NameLookupType.NICKNAME, 50, 60);
    124         setScoreRange(NameLookupType.NICKNAME,
    125                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    126         setScoreRange(NameLookupType.NICKNAME,
    127                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    128     }
    129 
    130     /**
    131      * Populates the cells of the score matrix and score span matrix
    132      * corresponding to the {@code candidateNameType} and {@code nameType}.
    133      */
    134     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
    135         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    136         sMinScore[index] = scoreFrom;
    137         sMaxScore[index] = scoreTo;
    138     }
    139 
    140     /**
    141      * Returns the lower range for the match score for the given {@code candidateNameType} and
    142      * {@code nameType}.
    143      */
    144     private static int getMinScore(int candidateNameType, int nameType) {
    145         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    146         return sMinScore[index];
    147     }
    148 
    149     /**
    150      * Returns the upper range for the match score for the given {@code candidateNameType} and
    151      * {@code nameType}.
    152      */
    153     private static int getMaxScore(int candidateNameType, int nameType) {
    154         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    155         return sMaxScore[index];
    156     }
    157 
    158     /**
    159      * Captures the max score and match count for a specific contact.  Used in an
    160      * contactId - MatchScore map.
    161      */
    162     public static class MatchScore implements Comparable<MatchScore> {
    163         private long mContactId;
    164         private boolean mKeepIn;
    165         private boolean mKeepOut;
    166         private int mPrimaryScore;
    167         private int mSecondaryScore;
    168         private int mMatchCount;
    169 
    170         public MatchScore(long contactId) {
    171             this.mContactId = contactId;
    172         }
    173 
    174         public void reset(long contactId) {
    175             this.mContactId = contactId;
    176             mKeepIn = false;
    177             mKeepOut = false;
    178             mPrimaryScore = 0;
    179             mSecondaryScore = 0;
    180             mMatchCount = 0;
    181         }
    182 
    183         public long getContactId() {
    184             return mContactId;
    185         }
    186 
    187         public void updatePrimaryScore(int score) {
    188             if (score > mPrimaryScore) {
    189                 mPrimaryScore = score;
    190             }
    191             mMatchCount++;
    192         }
    193 
    194         public void updateSecondaryScore(int score) {
    195             if (score > mSecondaryScore) {
    196                 mSecondaryScore = score;
    197             }
    198             mMatchCount++;
    199         }
    200 
    201         public void keepIn() {
    202             mKeepIn = true;
    203         }
    204 
    205         public void keepOut() {
    206             mKeepOut = true;
    207         }
    208 
    209         public int getScore() {
    210             if (mKeepOut) {
    211                 return 0;
    212             }
    213 
    214             if (mKeepIn) {
    215                 return MAX_SCORE;
    216             }
    217 
    218             int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore);
    219 
    220             // Ensure that of two contacts with the same match score the one with more matching
    221             // data elements wins.
    222             return score * SCORE_SCALE + mMatchCount;
    223         }
    224 
    225         /**
    226          * Descending order of match score.
    227          */
    228         @Override
    229         public int compareTo(MatchScore another) {
    230             return another.getScore() - getScore();
    231         }
    232 
    233         @Override
    234         public String toString() {
    235             return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount
    236                     + ")";
    237         }
    238     }
    239 
    240     private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
    241     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
    242     private int mScoreCount = 0;
    243 
    244     private final NameDistance mNameDistanceConservative = new NameDistance();
    245     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
    246 
    247     private MatchScore getMatchingScore(long contactId) {
    248         MatchScore matchingScore = mScores.get(contactId);
    249         if (matchingScore == null) {
    250             if (mScoreList.size() > mScoreCount) {
    251                 matchingScore = mScoreList.get(mScoreCount);
    252                 matchingScore.reset(contactId);
    253             } else {
    254                 matchingScore = new MatchScore(contactId);
    255                 mScoreList.add(matchingScore);
    256             }
    257             mScoreCount++;
    258             mScores.put(contactId, matchingScore);
    259         }
    260         return matchingScore;
    261     }
    262 
    263     /**
    264      * Marks the contact as a full match, because we found an Identity match
    265      */
    266     public void matchIdentity(long contactId) {
    267         updatePrimaryScore(contactId, MAX_SCORE);
    268     }
    269 
    270     /**
    271      * Checks if there is a match and updates the overall score for the
    272      * specified contact for a discovered match. The new score is determined
    273      * by the prior score, by the type of name we were looking for, the type
    274      * of name we found and, if the match is approximate, the distance between the candidate and
    275      * actual name.
    276      */
    277     public void matchName(long contactId, int candidateNameType, String candidateName,
    278             int nameType, String name, int algorithm) {
    279         int maxScore = getMaxScore(candidateNameType, nameType);
    280         if (maxScore == 0) {
    281             return;
    282         }
    283 
    284         if (candidateName.equals(name)) {
    285             updatePrimaryScore(contactId, maxScore);
    286             return;
    287         }
    288 
    289         if (algorithm == MATCHING_ALGORITHM_EXACT) {
    290             return;
    291         }
    292 
    293         int minScore = getMinScore(candidateNameType, nameType);
    294         if (minScore == maxScore) {
    295             return;
    296         }
    297 
    298         byte[] decodedCandidateName = Hex.decodeHex(candidateName);
    299         byte[] decodedName = Hex.decodeHex(name);
    300 
    301         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
    302                 mNameDistanceConservative : mNameDistanceApproximate;
    303 
    304         int score;
    305         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
    306         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
    307                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
    308         float threshold = emailBased
    309                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
    310                 : APPROXIMATE_MATCH_THRESHOLD;
    311         if (distance > threshold) {
    312             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
    313         } else {
    314             score = 0;
    315         }
    316 
    317         updatePrimaryScore(contactId, score);
    318     }
    319 
    320     public void updateScoreWithPhoneNumberMatch(long contactId) {
    321         updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
    322     }
    323 
    324     public void updateScoreWithEmailMatch(long contactId) {
    325         updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
    326     }
    327 
    328     public void updateScoreWithNicknameMatch(long contactId) {
    329         updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
    330     }
    331 
    332     private void updatePrimaryScore(long contactId, int score) {
    333         getMatchingScore(contactId).updatePrimaryScore(score);
    334     }
    335 
    336     private void updateSecondaryScore(long contactId, int score) {
    337         getMatchingScore(contactId).updateSecondaryScore(score);
    338     }
    339 
    340     public void keepIn(long contactId) {
    341         getMatchingScore(contactId).keepIn();
    342     }
    343 
    344     public void keepOut(long contactId) {
    345         getMatchingScore(contactId).keepOut();
    346     }
    347 
    348     public void clear() {
    349         mScores.clear();
    350         mScoreCount = 0;
    351     }
    352 
    353     /**
    354      * Returns a list of IDs for contacts that are matched on secondary data elements
    355      * (phone number, email address, nickname). We still need to obtain the approximate
    356      * primary score for those contacts to determine if any of them should be aggregated.
    357      * <p>
    358      * May return null.
    359      */
    360     public List<Long> prepareSecondaryMatchCandidates(int threshold) {
    361         ArrayList<Long> contactIds = null;
    362 
    363         for (int i = 0; i < mScoreCount; i++) {
    364             MatchScore score = mScoreList.get(i);
    365             if (score.mKeepOut) {
    366                 continue;
    367             }
    368 
    369             int s = score.mSecondaryScore;
    370             if (s >= threshold) {
    371                 if (contactIds == null) {
    372                     contactIds = new ArrayList<Long>();
    373                 }
    374                 contactIds.add(score.mContactId);
    375             }
    376             score.mPrimaryScore = NO_DATA_SCORE;
    377         }
    378         return contactIds;
    379     }
    380 
    381     /**
    382      * Returns the contactId with the best match score over the specified threshold or -1
    383      * if no such contact is found.
    384      */
    385     public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
    386         long contactId = -1;
    387         int maxScore = 0;
    388         for (int i = 0; i < mScoreCount; i++) {
    389             MatchScore score = mScoreList.get(i);
    390             if (score.mKeepOut) {
    391                 continue;
    392             }
    393 
    394             if (score.mKeepIn) {
    395                 return score.mContactId;
    396             }
    397 
    398             int s = score.mPrimaryScore;
    399             if (s == NO_DATA_SCORE) {
    400                 s = score.mSecondaryScore;
    401             }
    402 
    403             if (s >= threshold) {
    404                 if (contactId != -1 && !allowMultipleMatches) {
    405                     return MULTIPLE_MATCHES;
    406                 }
    407                 if (s > maxScore) {
    408                     contactId = score.mContactId;
    409                     maxScore = s;
    410                 }
    411             }
    412         }
    413         return contactId;
    414     }
    415 
    416     /**
    417      * Returns matches in the order of descending score.
    418      */
    419     public List<MatchScore> pickBestMatches(int threshold) {
    420         int scaledThreshold = threshold * SCORE_SCALE;
    421         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
    422         Collections.sort(matches);
    423         int count = 0;
    424         for (int i = 0; i < mScoreCount; i++) {
    425             MatchScore matchScore = matches.get(i);
    426             if (matchScore.getScore() >= scaledThreshold) {
    427                 count++;
    428             } else {
    429                 break;
    430             }
    431         }
    432 
    433         return matches.subList(0, count);
    434     }
    435 
    436     @Override
    437     public String toString() {
    438         return mScoreList.subList(0, mScoreCount).toString();
    439     }
    440 }
    441