Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License
     15  */
     16 package com.android.providers.contacts.aggregation.util;
     17 
     18 import android.util.Log;
     19 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType;
     20 import com.android.providers.contacts.util.Hex;
     21 
     22 import java.util.ArrayList;
     23 import java.util.Collections;
     24 import java.util.HashMap;
     25 import java.util.List;
     26 
     27 /**
     28  * Logic for matching raw contacts' data.
     29  */
     30 public class RawContactMatcher {
     31     private static final String TAG = "ContactMatcher";
     32 
     33     // Suggest to aggregate contacts if their match score is equal or greater than this threshold
     34     public static final int SCORE_THRESHOLD_SUGGEST = 50;
     35 
     36     public static final int SCORE_THRESHOLD_NO_NAME = 50;
     37 
     38     // Automatically aggregate contacts if their match score is equal or greater than this threshold
     39     public static final int SCORE_THRESHOLD_PRIMARY = 70;
     40 
     41     // Automatically aggregate contacts if the match score is equal or greater than this threshold
     42     // and there is a secondary match (phone number, email etc).
     43     public static final int SCORE_THRESHOLD_SECONDARY = 50;
     44 
     45     // Score for matching phone numbers
     46     private static final int PHONE_MATCH_SCORE = 71;
     47 
     48     // Score for matching email addresses
     49     private static final int EMAIL_MATCH_SCORE = 71;
     50 
     51     // Score for matching identity
     52     private static final int IDENTITY_MATCH_SCORE = 71;
     53 
     54     // Score for matching nickname
     55     private static final int NICKNAME_MATCH_SCORE = 71;
     56 
     57     // Maximum number of characters in a name to be considered by the matching algorithm.
     58     private static final int MAX_MATCHED_NAME_LENGTH = 30;
     59 
     60     // Scores a multiplied by this number to allow room for "fractional" scores
     61     private static final int SCORE_SCALE = 1000;
     62 
     63     public static final int MATCHING_ALGORITHM_EXACT = 0;
     64     public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1;
     65     public static final int MATCHING_ALGORITHM_APPROXIMATE = 2;
     66 
     67     // Minimum edit distance between two names to be considered an approximate match
     68     public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f;
     69 
     70     // Minimum edit distance between two email ids to be considered an approximate match
     71     public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f;
     72 
     73     // Returned value when we found multiple matches and that was not allowed
     74     public static final long MULTIPLE_MATCHES = -2;
     75 
     76     /**
     77      * Name matching scores: a matrix by name type vs. candidate lookup type.
     78      * For example, if the name type is "full name" while we are looking for a
     79      * "full name", the score may be 99. If we are looking for a "nickname" but
     80      * find "first name", the score may be 50 (see specific scores defined
     81      * below.)
     82      * <p>
     83      * For approximate matching, we have a range of scores, let's say 40-70.  Depending one how
     84      * similar the two strings are, the score will be somewhere between 40 and 70, with the exact
     85      * match producing the score of 70.  The score may also be 0 if the similarity (distance)
     86      * between the strings is below the threshold.
     87      * <p>
     88      * We use a string matching algorithm, which is particularly suited for
     89      * name matching. See {@link NameDistance}.
     90      */
     91     private static int[] sMinScore =
     92             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     93     private static int[] sMaxScore =
     94             new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT];
     95 
     96     /*
     97      * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE},
     98      * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are
     99      * not!  They are useful in three-way aggregation cases when we have, for example, both
    100      * John Smith and Smith John.  A third contact with the name John Smith should be aggregated
    101      * with the former rather than the latter.  This is why "reverse" matches have slightly lower
    102      * scores than direct matches.
    103      */
    104     static {
    105         setScoreRange(NameLookupType.NAME_EXACT,
    106                 NameLookupType.NAME_EXACT, 99, 99);
    107         setScoreRange(NameLookupType.NAME_VARIANT,
    108                 NameLookupType.NAME_VARIANT, 90, 90);
    109         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    110                 NameLookupType.NAME_COLLATION_KEY, 50, 80);
    111 
    112         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    113                 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60);
    114         setScoreRange(NameLookupType.NAME_COLLATION_KEY,
    115                 NameLookupType.NICKNAME, 50, 60);
    116 
    117         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    118                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    119         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    120                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    121         setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME,
    122                 NameLookupType.NICKNAME, 50, 60);
    123 
    124         setScoreRange(NameLookupType.NICKNAME,
    125                 NameLookupType.NICKNAME, 50, 60);
    126         setScoreRange(NameLookupType.NICKNAME,
    127                 NameLookupType.NAME_COLLATION_KEY, 50, 60);
    128         setScoreRange(NameLookupType.NICKNAME,
    129                 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60);
    130     }
    131 
    132     /**
    133      * Populates the cells of the score matrix and score span matrix
    134      * corresponding to the {@code candidateNameType} and {@code nameType}.
    135      */
    136     private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom,
    137             int scoreTo) {
    138         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    139         sMinScore[index] = scoreFrom;
    140         sMaxScore[index] = scoreTo;
    141     }
    142 
    143     /**
    144      * Returns the lower range for the match score for the given {@code candidateNameType} and
    145      * {@code nameType}.
    146      */
    147     private static int getMinScore(int candidateNameType, int nameType) {
    148         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    149         return sMinScore[index];
    150     }
    151 
    152     /**
    153      * Returns the upper range for the match score for the given {@code candidateNameType} and
    154      * {@code nameType}.
    155      */
    156     private static int getMaxScore(int candidateNameType, int nameType) {
    157         int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType;
    158         return sMaxScore[index];
    159     }
    160 
    161     private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>();
    162     private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>();
    163     private int mScoreCount = 0;
    164 
    165     private final NameDistance mNameDistanceConservative = new NameDistance();
    166     private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH);
    167 
    168     private MatchScore getMatchingScore(long rawContactId, long contactId, long accountId) {
    169         MatchScore matchingScore = mScores.get(rawContactId);
    170         if (matchingScore == null) {
    171             if (mScoreList.size() > mScoreCount) {
    172                 matchingScore = mScoreList.get(mScoreCount);
    173                 matchingScore.reset(rawContactId, contactId, accountId);
    174             } else {
    175                 matchingScore = new MatchScore(rawContactId, contactId, accountId);
    176                 mScoreList.add(matchingScore);
    177             }
    178             mScoreCount++;
    179             mScores.put(rawContactId, matchingScore);
    180         }
    181         return matchingScore;
    182     }
    183 
    184     /**
    185      * Checks if there is a match and updates the overall score for the
    186      * specified contact for a discovered match. The new score is determined
    187      * by the prior score, by the type of name we were looking for, the type
    188      * of name we found and, if the match is approximate, the distance between the candidate and
    189      * actual name.
    190      */
    191     public void matchName(long rawContactId, long contactId, long accountId, int
    192             candidateNameType, String candidateName, int nameType, String name, int algorithm) {
    193         int maxScore = getMaxScore(candidateNameType, nameType);
    194         if (maxScore == 0) {
    195             return;
    196         }
    197 
    198         if (candidateName.equals(name)) {
    199             updatePrimaryScore(rawContactId, contactId, accountId, maxScore);
    200             return;
    201         }
    202 
    203         if (algorithm == MATCHING_ALGORITHM_EXACT) {
    204             return;
    205         }
    206 
    207         int minScore = getMinScore(candidateNameType, nameType);
    208         if (minScore == maxScore) {
    209             return;
    210         }
    211 
    212         final byte[] decodedCandidateName;
    213         final byte[] decodedName;
    214         try {
    215             decodedCandidateName = Hex.decodeHex(candidateName);
    216             decodedName = Hex.decodeHex(name);
    217         } catch (RuntimeException e) {
    218             // How could this happen??  See bug 6827136
    219             Log.e(TAG, "Failed to decode normalized name.  Skipping.", e);
    220             return;
    221         }
    222 
    223         NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ?
    224                 mNameDistanceConservative : mNameDistanceApproximate;
    225 
    226         int score;
    227         float distance = nameDistance.getDistance(decodedCandidateName, decodedName);
    228         boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME
    229                 || nameType == NameLookupType.EMAIL_BASED_NICKNAME;
    230         float threshold = emailBased
    231                 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL
    232                 : APPROXIMATE_MATCH_THRESHOLD;
    233         if (distance > threshold) {
    234             score = (int)(minScore +  (maxScore - minScore) * (1.0f - distance));
    235         } else {
    236             score = 0;
    237         }
    238 
    239         updatePrimaryScore(rawContactId, contactId, accountId, score);
    240     }
    241 
    242     public void matchIdentity(long rawContactId, long contactId, long accountId) {
    243         updateSecondaryScore(rawContactId, contactId, accountId, IDENTITY_MATCH_SCORE);
    244     }
    245 
    246     public void updateScoreWithPhoneNumberMatch(long rawContactId, long contactId, long accountId) {
    247         updateSecondaryScore(rawContactId, contactId, accountId, PHONE_MATCH_SCORE);
    248     }
    249 
    250     public void updateScoreWithEmailMatch(long rawContactId, long contactId, long accountId) {
    251         updateSecondaryScore(rawContactId, contactId, accountId, EMAIL_MATCH_SCORE);
    252     }
    253 
    254     public void updateScoreWithNicknameMatch(long rawContactId, long contactId, long accountId) {
    255         updateSecondaryScore(rawContactId, contactId, accountId, NICKNAME_MATCH_SCORE);
    256     }
    257 
    258     private void updatePrimaryScore(long rawContactId, long contactId, long accountId, int score) {
    259         getMatchingScore(rawContactId, contactId, accountId).updatePrimaryScore(score);
    260     }
    261 
    262     private void updateSecondaryScore(long rawContactId, long contactId, long accountId,
    263             int score) {
    264         getMatchingScore(rawContactId, contactId, accountId).updateSecondaryScore(score);
    265     }
    266 
    267     public void keepIn(long rawContactId, long contactId, long accountId) {
    268         getMatchingScore(rawContactId, contactId, accountId).keepIn();
    269     }
    270 
    271     public void keepOut(long rawContactId, long contactId, long accountId) {
    272         getMatchingScore(rawContactId, contactId, accountId).keepOut();
    273     }
    274 
    275     public void clear() {
    276         mScores.clear();
    277         mScoreCount = 0;
    278     }
    279     /**
    280      * Returns a list of IDs for raw contacts that are only matched on secondary data elements
    281      * (phone number, email address, nickname, identity). We need to check if they are missing
    282      * structured name or not to decide if they should be aggregated.
    283      * <p>
    284      * May return null.
    285      */
    286     public List<Long> prepareSecondaryMatchCandidates() {
    287         ArrayList<Long> rawContactIds = null;
    288 
    289         for (int i = 0; i < mScoreCount; i++) {
    290             MatchScore score = mScoreList.get(i);
    291             if (score.isKeepOut() ||  score.getPrimaryScore() > SCORE_THRESHOLD_PRIMARY){
    292                 continue;
    293             }
    294 
    295             if (score.getSecondaryScore() >= SCORE_THRESHOLD_PRIMARY) {
    296                 if (rawContactIds == null) {
    297                     rawContactIds = new ArrayList<>();
    298                 }
    299                 rawContactIds.add(score.getRawContactId());
    300             }
    301             score.setPrimaryScore(0);
    302         }
    303         return rawContactIds;
    304     }
    305 
    306     /**
    307      * Returns the list of raw contact Ids with the match score over threshold.
    308      */
    309     public List<MatchScore> pickBestMatches() {
    310         final List<MatchScore> matches = new ArrayList<>();
    311         for (int i = 0; i < mScoreCount; i++) {
    312             MatchScore score = mScoreList.get(i);
    313             if (score.isKeepOut()) {
    314                 continue;
    315             }
    316 
    317             if (score.isKeepIn()) {
    318                 matches.add(score);
    319                 continue;
    320             }
    321 
    322             if (score.getPrimaryScore() >= SCORE_THRESHOLD_PRIMARY ||
    323                     (score.getPrimaryScore() == SCORE_THRESHOLD_NO_NAME &&
    324                             score.getSecondaryScore() > SCORE_THRESHOLD_SECONDARY)) {
    325                 matches.add(score);
    326             }
    327         }
    328         return matches;
    329     }
    330 
    331     /**
    332      * Returns matches in the order of descending score.
    333      */
    334     public List<MatchScore> pickBestMatches(int threshold) {
    335         int scaledThreshold = threshold * SCORE_SCALE;
    336         List<MatchScore> matches = mScoreList.subList(0, mScoreCount);
    337         Collections.sort(matches);
    338         int count = 0;
    339         for (int i = 0; i < mScoreCount; i++) {
    340             MatchScore matchScore = matches.get(i);
    341             if (matchScore.getScore() >= scaledThreshold) {
    342                 count++;
    343             } else {
    344                 break;
    345             }
    346         }
    347 
    348         return matches.subList(0, count);
    349     }
    350 
    351     @Override
    352     public String toString() {
    353         return mScoreList.subList(0, mScoreCount).toString();
    354     }
    355 
    356     public void matchNoName(Long rawContactId, Long contactId, Long accountId) {
    357         updatePrimaryScore(rawContactId, contactId, accountId, SCORE_THRESHOLD_NO_NAME);
    358     }
    359 }
    360