Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License
     15  */
     16 package com.android.providers.contacts.aggregation.util;
     17 
     18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
     19 import com.android.providers.contacts.util.Hex;
     20 
     21 import android.util.Log;
     22 
     23 import java.util.ArrayList;
     24 import java.util.Collections;
     25 import java.util.HashMap;
     26 import java.util.List;
     27 
     28 /**
     29  * Logic for matching contacts' data and accumulating match scores.
     30  */
     31 public class ContactMatcher {
     32     private static final String TAG = "ContactMatcher";
     33 
     34     // Best possible match score
     35     public static final int MAX_SCORE = 100;
     36 
     37     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
     38     public static final int SCORE_THRESHOLD_SUGGEST = 50;
     39 
     40     // Automatically aggregate contacts if their match score is equal or greater than this threshold
     41     public static final int SCORE_THRESHOLD_PRIMARY = 70;
     42 
     43     // Automatically aggregate contacts if the match score is equal or greater than this threshold
     44     // and there is a secondary match (phone number, email etc).
     45     public static final int SCORE_THRESHOLD_SECONDARY = 50;
     46 
     47     // Score for missing data (as opposed to present data but a bad match)
     48     private static final int NO_DATA_SCORE = -1;
     49 
     50     // Score for matching phone numbers
     51     private static final int PHONE_MATCH_SCORE = 71;
     52 
     53     // Score for matching email addresses
     54     private static final int EMAIL_MATCH_SCORE = 71;
     55 
     56     // Score for matching nickname
     57     private static final int NICKNAME_MATCH_SCORE = 71;
     58 
     59     // Maximum number of characters in a name to be considered by the matching algorithm.
     60     private static final int MAX_MATCHED_NAME_LENGTH = 30;
     61 
     62     // Scores a multiplied by this number to allow room for "fractional" scores
     63     private static final int SCORE_SCALE = 1000;
     64 
     65     public static final int MATCHING_ALGORITHM_EXACT = 0;
     66     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
     67     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
     68 
     69     // Minimum edit distance between two names to be considered an approximate match
     70     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
     71 
     72     // Minimum edit distance between two email ids to be considered an approximate match
     73     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
     74 
     75     // Returned value when we found multiple matches and that was not allowed
     76     public static final long MULTIPLE_MATCHES = -2;
     77 
     78     /**
     79      * Name matching scores: a matrix by name type vs. candidate lookup type.
     80      * For example, if the name type is "full name" while we are looking for a
     81      * "full name", the score may be 99. If we are looking for a "nickname" but
     82      * find "first name", the score may be 50 (see specific scores defined
     83      * below.)
     84      * <p>
     85      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
     86      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
     87      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
     88      * between the strings is below the threshold.
     89      * <p>
     90      * We use a string matching algorithm, which is particularly suited for
     91      * name matching. See {@link NameDistance}.
     92      */
     93     private static int[] sMinScore =
     94             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     95     private static int[] sMaxScore =
     96             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     97 
     98     /*
     99      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
    100      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
    101      * not!  They are useful in three-way aggregation cases when we have, for example, both
    102      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
    103      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
    104      * scores than direct matches.
    105      */
    106     static {
    107         setScoreRange(NameLookupType.NAME_EXACT,
    108                 NameLookupType.NAME_EXACT, 99, 99);
    109         setScoreRange(NameLookupType.NAME_VARIANT,
    110                 NameLookupType.NAME_VARIANT, 90, 90);
    111         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    112                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
    113 
    114         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    115                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
    116         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    117                 NameLookupType.NICKNAME, 50, 60);
    118 
    119         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    120                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    121         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    122                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    123         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    124                 NameLookupType.NICKNAME, 50, 60);
    125 
    126         setScoreRange(NameLookupType.NICKNAME,
    127                 NameLookupType.NICKNAME, 50, 60);
    128         setScoreRange(NameLookupType.NICKNAME,
    129                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    130         setScoreRange(NameLookupType.NICKNAME,
    131                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    132     }
    133 
    134     /**
    135      * Populates the cells of the score matrix and score span matrix
    136      * corresponding to the {@code candidateNameType} and {@code nameType}.
    137      */
    138     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
    139         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    140         sMinScore[index] = scoreFrom;
    141         sMaxScore[index] = scoreTo;
    142     }
    143 
    144     /**
    145      * Returns the lower range for the match score for the given {@code candidateNameType} and
    146      * {@code nameType}.
    147      */
    148     private static int getMinScore(int candidateNameType, int nameType) {
    149         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    150         return sMinScore[index];
    151     }
    152 
    153     /**
    154      * Returns the upper range for the match score for the given {@code candidateNameType} and
    155      * {@code nameType}.
    156      */
    157     private static int getMaxScore(int candidateNameType, int nameType) {
    158         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    159         return sMaxScore[index];
    160     }
    161 
    162     /**
    163      * Captures the max score and match count for a specific contact.  Used in an
    164      * contactId - MatchScore map.
    165      */
    166     public static class MatchScore implements Comparable<MatchScore> {
    167         private long mContactId;
    168         private boolean mKeepIn;
    169         private boolean mKeepOut;
    170         private int mPrimaryScore;
    171         private int mSecondaryScore;
    172         private int mMatchCount;
    173 
    174         public MatchScore(long contactId) {
    175             this.mContactId = contactId;
    176         }
    177 
    178         public void reset(long contactId) {
    179             this.mContactId = contactId;
    180             mKeepIn = false;
    181             mKeepOut = false;
    182             mPrimaryScore = 0;
    183             mSecondaryScore = 0;
    184             mMatchCount = 0;
    185         }
    186 
    187         public long getContactId() {
    188             return mContactId;
    189         }
    190 
    191         public void updatePrimaryScore(int score) {
    192             if (score > mPrimaryScore) {
    193                 mPrimaryScore = score;
    194             }
    195             mMatchCount++;
    196         }
    197 
    198         public void updateSecondaryScore(int score) {
    199             if (score > mSecondaryScore) {
    200                 mSecondaryScore = score;
    201             }
    202             mMatchCount++;
    203         }
    204 
    205         public void keepIn() {
    206             mKeepIn = true;
    207         }
    208 
    209         public void keepOut() {
    210             mKeepOut = true;
    211         }
    212 
    213         public int getScore() {
    214             if (mKeepOut) {
    215                 return 0;
    216             }
    217 
    218             if (mKeepIn) {
    219                 return MAX_SCORE;
    220             }
    221 
    222             int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore);
    223 
    224             // Ensure that of two contacts with the same match score the one with more matching
    225             // data elements wins.
    226             return score * SCORE_SCALE + mMatchCount;
    227         }
    228 
    229         /**
    230          * Descending order of match score.
    231          */
    232         @Override
    233         public int compareTo(MatchScore another) {
    234             return another.getScore() - getScore();
    235         }
    236 
    237         @Override
    238         public String toString() {
    239             return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount
    240                     + ")";
    241         }
    242     }
    243 
    244     private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
    245     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
    246     private int mScoreCount = 0;
    247 
    248     private final NameDistance mNameDistanceConservative = new NameDistance();
    249     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
    250 
    251     private MatchScore getMatchingScore(long contactId) {
    252         MatchScore matchingScore = mScores.get(contactId);
    253         if (matchingScore == null) {
    254             if (mScoreList.size() > mScoreCount) {
    255                 matchingScore = mScoreList.get(mScoreCount);
    256                 matchingScore.reset(contactId);
    257             } else {
    258                 matchingScore = new MatchScore(contactId);
    259                 mScoreList.add(matchingScore);
    260             }
    261             mScoreCount++;
    262             mScores.put(contactId, matchingScore);
    263         }
    264         return matchingScore;
    265     }
    266 
    267     /**
    268      * Marks the contact as a full match, because we found an Identity match
    269      */
    270     public void matchIdentity(long contactId) {
    271         updatePrimaryScore(contactId, MAX_SCORE);
    272     }
    273 
    274     /**
    275      * Checks if there is a match and updates the overall score for the
    276      * specified contact for a discovered match. The new score is determined
    277      * by the prior score, by the type of name we were looking for, the type
    278      * of name we found and, if the match is approximate, the distance between the candidate and
    279      * actual name.
    280      */
    281     public void matchName(long contactId, int candidateNameType, String candidateName,
    282             int nameType, String name, int algorithm) {
    283         int maxScore = getMaxScore(candidateNameType, nameType);
    284         if (maxScore == 0) {
    285             return;
    286         }
    287 
    288         if (candidateName.equals(name)) {
    289             updatePrimaryScore(contactId, maxScore);
    290             return;
    291         }
    292 
    293         if (algorithm == MATCHING_ALGORITHM_EXACT) {
    294             return;
    295         }
    296 
    297         int minScore = getMinScore(candidateNameType, nameType);
    298         if (minScore == maxScore) {
    299             return;
    300         }
    301 
    302         final byte[] decodedCandidateName;
    303         final byte[] decodedName;
    304         try {
    305             decodedCandidateName = Hex.decodeHex(candidateName);
    306             decodedName = Hex.decodeHex(name);
    307         } catch (RuntimeException e) {
    308             // How could this happen??  See bug 6827136
    309             Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
    310             return;
    311         }
    312 
    313         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
    314                 mNameDistanceConservative : mNameDistanceApproximate;
    315 
    316         int score;
    317         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
    318         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
    319                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
    320         float threshold = emailBased
    321                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
    322                 : APPROXIMATE_MATCH_THRESHOLD;
    323         if (distance > threshold) {
    324             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
    325         } else {
    326             score = 0;
    327         }
    328 
    329         updatePrimaryScore(contactId, score);
    330     }
    331 
    332     public void updateScoreWithPhoneNumberMatch(long contactId) {
    333         updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
    334     }
    335 
    336     public void updateScoreWithEmailMatch(long contactId) {
    337         updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
    338     }
    339 
    340     public void updateScoreWithNicknameMatch(long contactId) {
    341         updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
    342     }
    343 
    344     private void updatePrimaryScore(long contactId, int score) {
    345         getMatchingScore(contactId).updatePrimaryScore(score);
    346     }
    347 
    348     private void updateSecondaryScore(long contactId, int score) {
    349         getMatchingScore(contactId).updateSecondaryScore(score);
    350     }
    351 
    352     public void keepIn(long contactId) {
    353         getMatchingScore(contactId).keepIn();
    354     }
    355 
    356     public void keepOut(long contactId) {
    357         getMatchingScore(contactId).keepOut();
    358     }
    359 
    360     public void clear() {
    361         mScores.clear();
    362         mScoreCount = 0;
    363     }
    364 
    365     /**
    366      * Returns a list of IDs for contacts that are matched on secondary data elements
    367      * (phone number, email address, nickname). We still need to obtain the approximate
    368      * primary score for those contacts to determine if any of them should be aggregated.
    369      * <p>
    370      * May return null.
    371      */
    372     public List<Long> prepareSecondaryMatchCandidates(int threshold) {
    373         ArrayList<Long> contactIds = null;
    374 
    375         for (int i = 0; i < mScoreCount; i++) {
    376             MatchScore score = mScoreList.get(i);
    377             if (score.mKeepOut) {
    378                 continue;
    379             }
    380 
    381             int s = score.mSecondaryScore;
    382             if (s >= threshold) {
    383                 if (contactIds == null) {
    384                     contactIds = new ArrayList<Long>();
    385                 }
    386                 contactIds.add(score.mContactId);
    387             }
    388             score.mPrimaryScore = NO_DATA_SCORE;
    389         }
    390         return contactIds;
    391     }
    392 
    393     /**
    394      * Returns the contactId with the best match score over the specified threshold or -1
    395      * if no such contact is found.  If multiple contacts are found, and
    396      * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if
    397      * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}.
    398      */
    399     public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
    400         long contactId = -1;
    401         int maxScore = 0;
    402         for (int i = 0; i < mScoreCount; i++) {
    403             MatchScore score = mScoreList.get(i);
    404             if (score.mKeepOut) {
    405                 continue;
    406             }
    407 
    408             if (score.mKeepIn) {
    409                 return score.mContactId;
    410             }
    411 
    412             int s = score.mPrimaryScore;
    413             if (s == NO_DATA_SCORE) {
    414                 s = score.mSecondaryScore;
    415             }
    416 
    417             if (s >= threshold) {
    418                 if (contactId != -1 && !allowMultipleMatches) {
    419                     return MULTIPLE_MATCHES;
    420                 }
    421                 // In order to make it stable, let's jut pick the one with the lowest ID
    422                 // if multiple candidates are found.
    423                 if ((s > maxScore) || ((s == maxScore) && (contactId > score.mContactId))) {
    424                     contactId = score.mContactId;
    425                     maxScore = s;
    426                 }
    427             }
    428         }
    429         return contactId;
    430     }
    431 
    432     /**
    433      * Returns matches in the order of descending score.
    434      */
    435     public List<MatchScore> pickBestMatches(int threshold) {
    436         int scaledThreshold = threshold * SCORE_SCALE;
    437         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
    438         Collections.sort(matches);
    439         int count = 0;
    440         for (int i = 0; i < mScoreCount; i++) {
    441             MatchScore matchScore = matches.get(i);
    442             if (matchScore.getScore() >= scaledThreshold) {
    443                 count++;
    444             } else {
    445                 break;
    446             }
    447         }
    448 
    449         return matches.subList(0, count);
    450     }
    451 
    452     @Override
    453     public String toString() {
    454         return mScoreList.subList(0, mScoreCount).toString();
    455     }
    456 }
    457