Home | History | Annotate | Download | only in contacts
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License
     15  */
     16 package com.android.providers.contacts;
     17 
     18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
     19 
     20 import java.util.ArrayList;
     21 import java.util.Collections;
     22 import java.util.HashMap;
     23 import java.util.List;
     24 
     25 /**
     26  * Logic for matching contacts' data and accumulating match scores.
     27  */
     28 public class ContactMatcher {
     29 
     30     // Best possible match score
     31     public static final int MAX_SCORE = 100;
     32 
     33     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
     34     public static final int SCORE_THRESHOLD_SUGGEST = 50;
     35 
     36     // Automatically aggregate contacts if their match score is equal or greater than this threshold
     37     public static final int SCORE_THRESHOLD_PRIMARY = 70;
     38 
     39     // Automatically aggregate contacts if the match score is equal or greater than this threshold
     40     // and there is a secondary match (phone number, email etc).
     41     public static final int SCORE_THRESHOLD_SECONDARY = 50;
     42 
     43     // Score for missing data (as opposed to present data but a bad match)
     44     private static final int NO_DATA_SCORE = -1;
     45 
     46     // Score for matching phone numbers
     47     private static final int PHONE_MATCH_SCORE = 71;
     48 
     49     // Score for matching email addresses
     50     private static final int EMAIL_MATCH_SCORE = 71;
     51 
     52     // Score for matching nickname
     53     private static final int NICKNAME_MATCH_SCORE = 71;
     54 
     55     // Maximum number of characters in a name to be considered by the matching algorithm.
     56     private static final int MAX_MATCHED_NAME_LENGTH = 30;
     57 
     58     // Scores a multiplied by this number to allow room for "fractional" scores
     59     private static final int SCORE_SCALE = 1000;
     60 
     61     public static final int MATCHING_ALGORITHM_EXACT = 0;
     62     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
     63     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
     64 
     65     // Minimum edit distance between two names to be considered an approximate match
     66     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
     67 
     68     // Minimum edit distance between two email ids to be considered an approximate match
     69     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
     70 
     71     // Returned value when we found multiple matches and that was not allowed
     72     public static final long MULTIPLE_MATCHES = -2;
     73 
     74     /**
     75      * Name matching scores: a matrix by name type vs. candidate lookup type.
     76      * For example, if the name type is "full name" while we are looking for a
     77      * "full name", the score may be 99. If we are looking for a "nickname" but
     78      * find "first name", the score may be 50 (see specific scores defined
     79      * below.)
     80      * <p>
     81      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
     82      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
     83      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
     84      * between the strings is below the threshold.
     85      * <p>
     86      * We use a string matching algorithm, which is particularly suited for
     87      * name matching. See {@link NameDistance}.
     88      */
     89     private static int[] sMinScore =
     90             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     91     private static int[] sMaxScore =
     92             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     93 
     94     /*
     95      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
     96      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
     97      * not!  They are useful in three-way aggregation cases when we have, for example, both
     98      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
     99      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
    100      * scores than direct matches.
    101      */
    102     static {
    103         setScoreRange(NameLookupType.NAME_EXACT,
    104                 NameLookupType.NAME_EXACT, 99, 99);
    105         setScoreRange(NameLookupType.NAME_VARIANT,
    106                 NameLookupType.NAME_VARIANT, 90, 90);
    107         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    108                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
    109 
    110         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    111                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
    112         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    113                 NameLookupType.NICKNAME, 50, 60);
    114 
    115         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    116                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    117         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    118                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    119         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    120                 NameLookupType.NICKNAME, 50, 60);
    121 
    122         setScoreRange(NameLookupType.NICKNAME,
    123                 NameLookupType.NICKNAME, 50, 60);
    124         setScoreRange(NameLookupType.NICKNAME,
    125                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    126         setScoreRange(NameLookupType.NICKNAME,
    127                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    128     }
    129 
    130     /**
    131      * Populates the cells of the score matrix and score span matrix
    132      * corresponding to the {@code candidateNameType} and {@code nameType}.
    133      */
    134     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
    135         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    136         sMinScore[index] = scoreFrom;
    137         sMaxScore[index] = scoreTo;
    138     }
    139 
    140     /**
    141      * Returns the lower range for the match score for the given {@code candidateNameType} and
    142      * {@code nameType}.
    143      */
    144     private static int getMinScore(int candidateNameType, int nameType) {
    145         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    146         return sMinScore[index];
    147     }
    148 
    149     /**
    150      * Returns the upper range for the match score for the given {@code candidateNameType} and
    151      * {@code nameType}.
    152      */
    153     private static int getMaxScore(int candidateNameType, int nameType) {
    154         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    155         return sMaxScore[index];
    156     }
    157 
    158     /**
    159      * Captures the max score and match count for a specific contact.  Used in an
    160      * contactId - MatchScore map.
    161      */
    162     public static class MatchScore implements Comparable<MatchScore> {
    163         private long mContactId;
    164         private boolean mKeepIn;
    165         private boolean mKeepOut;
    166         private int mPrimaryScore;
    167         private int mSecondaryScore;
    168         private int mMatchCount;
    169 
    170         public MatchScore(long contactId) {
    171             this.mContactId = contactId;
    172         }
    173 
    174         public void reset(long contactId) {
    175             this.mContactId = contactId;
    176             mKeepIn = false;
    177             mKeepOut = false;
    178             mPrimaryScore = 0;
    179             mSecondaryScore = 0;
    180             mMatchCount = 0;
    181         }
    182 
    183         public long getContactId() {
    184             return mContactId;
    185         }
    186 
    187         public void updatePrimaryScore(int score) {
    188             if (score > mPrimaryScore) {
    189                 mPrimaryScore = score;
    190             }
    191             mMatchCount++;
    192         }
    193 
    194         public void updateSecondaryScore(int score) {
    195             if (score > mSecondaryScore) {
    196                 mSecondaryScore = score;
    197             }
    198             mMatchCount++;
    199         }
    200 
    201         public void keepIn() {
    202             mKeepIn = true;
    203         }
    204 
    205         public void keepOut() {
    206             mKeepOut = true;
    207         }
    208 
    209         public int getScore() {
    210             if (mKeepOut) {
    211                 return 0;
    212             }
    213 
    214             if (mKeepIn) {
    215                 return MAX_SCORE;
    216             }
    217 
    218             int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore);
    219 
    220             // Ensure that of two contacts with the same match score the one with more matching
    221             // data elements wins.
    222             return score * SCORE_SCALE + mMatchCount;
    223         }
    224 
    225         /**
    226          * Descending order of match score.
    227          */
    228         public int compareTo(MatchScore another) {
    229             return another.getScore() - getScore();
    230         }
    231 
    232         @Override
    233         public String toString() {
    234             return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount
    235                     + ")";
    236         }
    237     }
    238 
    239     private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
    240     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
    241     private int mScoreCount = 0;
    242 
    243     private final NameDistance mNameDistanceConservative = new NameDistance();
    244     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
    245 
    246     private MatchScore getMatchingScore(long contactId) {
    247         MatchScore matchingScore = mScores.get(contactId);
    248         if (matchingScore == null) {
    249             if (mScoreList.size() > mScoreCount) {
    250                 matchingScore = mScoreList.get(mScoreCount);
    251                 matchingScore.reset(contactId);
    252             } else {
    253                 matchingScore = new MatchScore(contactId);
    254                 mScoreList.add(matchingScore);
    255             }
    256             mScoreCount++;
    257             mScores.put(contactId, matchingScore);
    258         }
    259         return matchingScore;
    260     }
    261 
    262     /**
    263      * Checks if there is a match and updates the overall score for the
    264      * specified contact for a discovered match. The new score is determined
    265      * by the prior score, by the type of name we were looking for, the type
    266      * of name we found and, if the match is approximate, the distance between the candidate and
    267      * actual name.
    268      */
    269     public void matchName(long contactId, int candidateNameType, String candidateName,
    270             int nameType, String name, int algorithm) {
    271         int maxScore = getMaxScore(candidateNameType, nameType);
    272         if (maxScore == 0) {
    273             return;
    274         }
    275 
    276         if (candidateName.equals(name)) {
    277             updatePrimaryScore(contactId, maxScore);
    278             return;
    279         }
    280 
    281         if (algorithm == MATCHING_ALGORITHM_EXACT) {
    282             return;
    283         }
    284 
    285         int minScore = getMinScore(candidateNameType, nameType);
    286         if (minScore == maxScore) {
    287             return;
    288         }
    289 
    290         byte[] decodedCandidateName = Hex.decodeHex(candidateName);
    291         byte[] decodedName = Hex.decodeHex(name);
    292 
    293         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
    294                 mNameDistanceConservative : mNameDistanceApproximate;
    295 
    296         int score;
    297         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
    298         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
    299                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
    300         float threshold = emailBased
    301                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
    302                 : APPROXIMATE_MATCH_THRESHOLD;
    303         if (distance > threshold) {
    304             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
    305         } else {
    306             score = 0;
    307         }
    308 
    309         updatePrimaryScore(contactId, score);
    310     }
    311 
    312     public void updateScoreWithPhoneNumberMatch(long contactId) {
    313         updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
    314     }
    315 
    316     public void updateScoreWithEmailMatch(long contactId) {
    317         updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
    318     }
    319 
    320     public void updateScoreWithNicknameMatch(long contactId) {
    321         updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
    322     }
    323 
    324     private void updatePrimaryScore(long contactId, int score) {
    325         getMatchingScore(contactId).updatePrimaryScore(score);
    326     }
    327 
    328     private void updateSecondaryScore(long contactId, int score) {
    329         getMatchingScore(contactId).updateSecondaryScore(score);
    330     }
    331 
    332     public void keepIn(long contactId) {
    333         getMatchingScore(contactId).keepIn();
    334     }
    335 
    336     public void keepOut(long contactId) {
    337         getMatchingScore(contactId).keepOut();
    338     }
    339 
    340     public void clear() {
    341         mScores.clear();
    342         mScoreCount = 0;
    343     }
    344 
    345     /**
    346      * Returns a list of IDs for contacts that are matched on secondary data elements
    347      * (phone number, email address, nickname). We still need to obtain the approximate
    348      * primary score for those contacts to determine if any of them should be aggregated.
    349      * <p>
    350      * May return null.
    351      */
    352     public List<Long> prepareSecondaryMatchCandidates(int threshold) {
    353         ArrayList<Long> contactIds = null;
    354 
    355         for (int i = 0; i < mScoreCount; i++) {
    356             MatchScore score = mScoreList.get(i);
    357             if (score.mKeepOut) {
    358                 continue;
    359             }
    360 
    361             int s = score.mSecondaryScore;
    362             if (s >= threshold) {
    363                 if (contactIds == null) {
    364                     contactIds = new ArrayList<Long>();
    365                 }
    366                 contactIds.add(score.mContactId);
    367             }
    368             score.mPrimaryScore = NO_DATA_SCORE;
    369         }
    370         return contactIds;
    371     }
    372 
    373     /**
    374      * Returns the contactId with the best match score over the specified threshold or -1
    375      * if no such contact is found.
    376      */
    377     public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
    378         long contactId = -1;
    379         int maxScore = 0;
    380         for (int i = 0; i < mScoreCount; i++) {
    381             MatchScore score = mScoreList.get(i);
    382             if (score.mKeepOut) {
    383                 continue;
    384             }
    385 
    386             if (score.mKeepIn) {
    387                 return score.mContactId;
    388             }
    389 
    390             int s = score.mPrimaryScore;
    391             if (s == NO_DATA_SCORE) {
    392                 s = score.mSecondaryScore;
    393             }
    394 
    395             if (s >= threshold) {
    396                 if (contactId != -1 && !allowMultipleMatches) {
    397                     return MULTIPLE_MATCHES;
    398                 }
    399                 if (s > maxScore) {
    400                     contactId = score.mContactId;
    401                     maxScore = s;
    402                 }
    403             }
    404         }
    405         return contactId;
    406     }
    407 
    408     /**
    409      * Returns matches in the order of descending score.
    410      */
    411     public List<MatchScore> pickBestMatches(int threshold) {
    412         int scaledThreshold = threshold * SCORE_SCALE;
    413         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
    414         Collections.sort(matches);
    415         int count = 0;
    416         for (int i = 0; i < mScoreCount; i++) {
    417             MatchScore matchScore = matches.get(i);
    418             if (matchScore.getScore() >= scaledThreshold) {
    419                 count++;
    420             } else {
    421                 break;
    422             }
    423         }
    424 
    425         return matches.subList(0, count);
    426     }
    427 
    428     @Override
    429     public String toString() {
    430         return mScoreList.subList(0, mScoreCount).toString();
    431     }
    432 }
    433