Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License
     15  */
     16 package com.android.providers.contacts.aggregation.util;
     17 
     18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
     19 import com.android.providers.contacts.util.Hex;
     20 
     21 import android.util.ArrayMap;
     22 import android.util.Log;
     23 
     24 import java.util.ArrayList;
     25 import java.util.Collections;
     26 import java.util.List;
     27 
     28 /**
     29  * Logic for matching contacts' data and accumulating match scores.
     30  */
     31 public class ContactMatcher {
     32     private static final String TAG = "ContactMatcher";
     33 
     34     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
     35     public static final int SCORE_THRESHOLD_SUGGEST = 50;
     36 
     37     // Automatically aggregate contacts if their match score is equal or greater than this threshold
     38     public static final int SCORE_THRESHOLD_PRIMARY = 70;
     39 
     40     // Automatically aggregate contacts if the match score is equal or greater than this threshold
     41     // and there is a secondary match (phone number, email etc).
     42     public static final int SCORE_THRESHOLD_SECONDARY = 50;
     43 
     44     // Score for missing data (as opposed to present data but a bad match)
     45     private static final int NO_DATA_SCORE = -1;
     46 
     47     // Score for matching phone numbers
     48     private static final int PHONE_MATCH_SCORE = 71;
     49 
     50     // Score for matching email addresses
     51     private static final int EMAIL_MATCH_SCORE = 71;
     52 
     53     // Score for matching nickname
     54     private static final int NICKNAME_MATCH_SCORE = 71;
     55 
     56     // Maximum number of characters in a name to be considered by the matching algorithm.
     57     private static final int MAX_MATCHED_NAME_LENGTH = 30;
     58 
     59     public static final int MATCHING_ALGORITHM_EXACT = 0;
     60     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
     61     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
     62 
     63     // Minimum edit distance between two names to be considered an approximate match
     64     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
     65 
     66     // Minimum edit distance between two email ids to be considered an approximate match
     67     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
     68 
     69     // Returned value when we found multiple matches and that was not allowed
     70     public static final long MULTIPLE_MATCHES = -2;
     71 
     72     /**
     73      * Name matching scores: a matrix by name type vs. candidate lookup type.
     74      * For example, if the name type is "full name" while we are looking for a
     75      * "full name", the score may be 99. If we are looking for a "nickname" but
     76      * find "first name", the score may be 50 (see specific scores defined
     77      * below.)
     78      * <p>
     79      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
     80      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
     81      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
     82      * between the strings is below the threshold.
     83      * <p>
     84      * We use a string matching algorithm, which is particularly suited for
     85      * name matching. See {@link NameDistance}.
     86      */
     87     private static int[] sMinScore =
     88             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     89     private static int[] sMaxScore =
     90             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     91 
     92     /*
     93      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
     94      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
     95      * not!  They are useful in three-way aggregation cases when we have, for example, both
     96      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
     97      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
     98      * scores than direct matches.
     99      */
    100     static {
    101         setScoreRange(NameLookupType.NAME_EXACT,
    102                 NameLookupType.NAME_EXACT, 99, 99);
    103         setScoreRange(NameLookupType.NAME_VARIANT,
    104                 NameLookupType.NAME_VARIANT, 90, 90);
    105         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    106                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
    107 
    108         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    109                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
    110         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    111                 NameLookupType.NICKNAME, 50, 60);
    112 
    113         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    114                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    115         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    116                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    117         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    118                 NameLookupType.NICKNAME, 50, 60);
    119 
    120         setScoreRange(NameLookupType.NICKNAME,
    121                 NameLookupType.NICKNAME, 50, 60);
    122         setScoreRange(NameLookupType.NICKNAME,
    123                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    124         setScoreRange(NameLookupType.NICKNAME,
    125                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    126     }
    127 
    128     /**
    129      * Populates the cells of the score matrix and score span matrix
    130      * corresponding to the {@code candidateNameType} and {@code nameType}.
    131      */
    132     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) {
    133         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    134         sMinScore[index] = scoreFrom;
    135         sMaxScore[index] = scoreTo;
    136     }
    137 
    138     /**
    139      * Returns the lower range for the match score for the given {@code candidateNameType} and
    140      * {@code nameType}.
    141      */
    142     private static int getMinScore(int candidateNameType, int nameType) {
    143         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    144         return sMinScore[index];
    145     }
    146 
    147     /**
    148      * Returns the upper range for the match score for the given {@code candidateNameType} and
    149      * {@code nameType}.
    150      */
    151     private static int getMaxScore(int candidateNameType, int nameType) {
    152         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    153         return sMaxScore[index];
    154     }
    155 
    156     private final ArrayMap<Long, MatchScore> mScores = new ArrayMap<>();
    157     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
    158     private int mScoreCount = 0;
    159 
    160     private final NameDistance mNameDistanceConservative = new NameDistance();
    161     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
    162 
    163     private MatchScore getMatchingScore(long contactId) {
    164         MatchScore matchingScore = mScores.get(contactId);
    165         if (matchingScore == null) {
    166             if (mScoreList.size() > mScoreCount) {
    167                 matchingScore = mScoreList.get(mScoreCount);
    168                 matchingScore.reset(contactId);
    169             } else {
    170                 matchingScore = new MatchScore(contactId);
    171                 mScoreList.add(matchingScore);
    172             }
    173             mScoreCount++;
    174             mScores.put(contactId, matchingScore);
    175         }
    176         return matchingScore;
    177     }
    178 
    179     /**
    180      * Marks the contact as a full match, because we found an Identity match
    181      */
    182     public void matchIdentity(long contactId) {
    183         updatePrimaryScore(contactId, MatchScore.MAX_SCORE);
    184     }
    185 
    186     /**
    187      * Checks if there is a match and updates the overall score for the
    188      * specified contact for a discovered match. The new score is determined
    189      * by the prior score, by the type of name we were looking for, the type
    190      * of name we found and, if the match is approximate, the distance between the candidate and
    191      * actual name.
    192      */
    193     public void matchName(long contactId, int candidateNameType, String candidateName,
    194             int nameType, String name, int algorithm) {
    195         int maxScore = getMaxScore(candidateNameType, nameType);
    196         if (maxScore == 0) {
    197             return;
    198         }
    199 
    200         if (candidateName.equals(name)) {
    201             updatePrimaryScore(contactId, maxScore);
    202             return;
    203         }
    204 
    205         if (algorithm == MATCHING_ALGORITHM_EXACT) {
    206             return;
    207         }
    208 
    209         int minScore = getMinScore(candidateNameType, nameType);
    210         if (minScore == maxScore) {
    211             return;
    212         }
    213 
    214         final byte[] decodedCandidateName;
    215         final byte[] decodedName;
    216         try {
    217             decodedCandidateName = Hex.decodeHex(candidateName);
    218             decodedName = Hex.decodeHex(name);
    219         } catch (RuntimeException e) {
    220             // How could this happen??  See bug 6827136
    221             Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
    222             return;
    223         }
    224 
    225         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
    226                 mNameDistanceConservative : mNameDistanceApproximate;
    227 
    228         int score;
    229         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
    230         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
    231                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
    232         float threshold = emailBased
    233                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
    234                 : APPROXIMATE_MATCH_THRESHOLD;
    235         if (distance > threshold) {
    236             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
    237         } else {
    238             score = 0;
    239         }
    240 
    241         updatePrimaryScore(contactId, score);
    242     }
    243 
    244     public void updateScoreWithPhoneNumberMatch(long contactId) {
    245         updateSecondaryScore(contactId, PHONE_MATCH_SCORE);
    246     }
    247 
    248     public void updateScoreWithEmailMatch(long contactId) {
    249         updateSecondaryScore(contactId, EMAIL_MATCH_SCORE);
    250     }
    251 
    252     public void updateScoreWithNicknameMatch(long contactId) {
    253         updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE);
    254     }
    255 
    256     private void updatePrimaryScore(long contactId, int score) {
    257         getMatchingScore(contactId).updatePrimaryScore(score);
    258     }
    259 
    260     private void updateSecondaryScore(long contactId, int score) {
    261         getMatchingScore(contactId).updateSecondaryScore(score);
    262     }
    263 
    264     public void keepIn(long contactId) {
    265         getMatchingScore(contactId).keepIn();
    266     }
    267 
    268     public void keepOut(long contactId) {
    269         getMatchingScore(contactId).keepOut();
    270     }
    271 
    272     public void clear() {
    273         mScores.clear();
    274         mScoreCount = 0;
    275     }
    276 
    277     /**
    278      * Returns a list of IDs for contacts that are matched on secondary data elements
    279      * (phone number, email address, nickname). We still need to obtain the approximate
    280      * primary score for those contacts to determine if any of them should be aggregated.
    281      * <p>
    282      * May return null.
    283      */
    284     public List<Long> prepareSecondaryMatchCandidates(int threshold) {
    285         ArrayList<Long> contactIds = null;
    286 
    287         for (int i = 0; i < mScoreCount; i++) {
    288             MatchScore score = mScoreList.get(i);
    289             if (score.isKeepOut()) {
    290                 continue;
    291             }
    292 
    293             int s = score.getSecondaryScore();
    294             if (s >= threshold) {
    295                 if (contactIds == null) {
    296                     contactIds = new ArrayList<Long>();
    297                 }
    298                 contactIds.add(score.getContactId());
    299             }
    300             score.setPrimaryScore(NO_DATA_SCORE);
    301         }
    302         return contactIds;
    303     }
    304 
    305     /**
    306      * Returns the contactId with the best match score over the specified threshold or -1
    307      * if no such contact is found.  If multiple contacts are found, and
    308      * {@code allowMultipleMatches} is {@code true}, it returns the first one found, but if
    309      * {@code allowMultipleMatches} is {@code false} it'll return {@link #MULTIPLE_MATCHES}.
    310      */
    311     public long pickBestMatch(int threshold, boolean allowMultipleMatches) {
    312         long contactId = -1;
    313         int maxScore = 0;
    314         for (int i = 0; i < mScoreCount; i++) {
    315             MatchScore score = mScoreList.get(i);
    316             if (score.isKeepOut()) {
    317                 continue;
    318             }
    319 
    320             if (score.isKeepIn()) {
    321                 return score.getContactId();
    322             }
    323 
    324             int s = score.getPrimaryScore();
    325             if (s == NO_DATA_SCORE) {
    326                 s = score.getSecondaryScore();
    327             }
    328 
    329             if (s >= threshold) {
    330                 if (contactId != -1 && !allowMultipleMatches) {
    331                     return MULTIPLE_MATCHES;
    332                 }
    333                 // In order to make it stable, let's jut pick the one with the lowest ID
    334                 // if multiple candidates are found.
    335                 if ((s > maxScore) || ((s == maxScore) && (contactId > score.getContactId()))) {
    336                     contactId = score.getContactId();
    337                     maxScore = s;
    338                 }
    339             }
    340         }
    341         return contactId;
    342     }
    343 
    344     /**
    345      * Returns matches in the order of descending score.
    346      */
    347     public List<MatchScore> pickBestMatches(int threshold) {
    348         int scaledThreshold = threshold * MatchScore.SCORE_SCALE;
    349         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
    350         Collections.sort(matches);
    351         int count = 0;
    352         for (int i = 0; i < mScoreCount; i++) {
    353             MatchScore matchScore = matches.get(i);
    354             if (matchScore.getScore() >= scaledThreshold) {
    355                 count++;
    356             } else {
    357                 break;
    358             }
    359         }
    360 
    361         return matches.subList(0, count);
    362     }
    363 
    364     @Override
    365     public String toString() {
    366         return mScoreList.subList(0, mScoreCount).toString();
    367     }
    368 }
    369