1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts.aggregation.util; 17 18 import android.util.Log; 19 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; 20 import com.android.providers.contacts.util.Hex; 21 22 import java.util.ArrayList; 23 import java.util.Collections; 24 import java.util.HashMap; 25 import java.util.List; 26 27 /** 28 * Logic for matching raw contacts' data. 29 */ 30 public class RawContactMatcher { 31 private static final String TAG = "ContactMatcher"; 32 33 // Suggest to aggregate contacts if their match score is equal or greater than this threshold 34 public static final int SCORE_THRESHOLD_SUGGEST = 50; 35 36 public static final int SCORE_THRESHOLD_NO_NAME = 50; 37 38 // Automatically aggregate contacts if their match score is equal or greater than this threshold 39 public static final int SCORE_THRESHOLD_PRIMARY = 70; 40 41 // Automatically aggregate contacts if the match score is equal or greater than this threshold 42 // and there is a secondary match (phone number, email etc). 43 public static final int SCORE_THRESHOLD_SECONDARY = 50; 44 45 // Score for matching phone numbers 46 private static final int PHONE_MATCH_SCORE = 71; 47 48 // Score for matching email addresses 49 private static final int EMAIL_MATCH_SCORE = 71; 50 51 // Score for matching identity 52 private static final int IDENTITY_MATCH_SCORE = 71; 53 54 // Score for matching nickname 55 private static final int NICKNAME_MATCH_SCORE = 71; 56 57 // Maximum number of characters in a name to be considered by the matching algorithm. 58 private static final int MAX_MATCHED_NAME_LENGTH = 30; 59 60 // Scores a multiplied by this number to allow room for "fractional" scores 61 private static final int SCORE_SCALE = 1000; 62 63 public static final int MATCHING_ALGORITHM_EXACT = 0; 64 public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; 65 public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; 66 67 // Minimum edit distance between two names to be considered an approximate match 68 public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; 69 70 // Minimum edit distance between two email ids to be considered an approximate match 71 public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; 72 73 // Returned value when we found multiple matches and that was not allowed 74 public static final long MULTIPLE_MATCHES = -2; 75 76 /** 77 * Name matching scores: a matrix by name type vs. candidate lookup type. 78 * For example, if the name type is "full name" while we are looking for a 79 * "full name", the score may be 99. If we are looking for a "nickname" but 80 * find "first name", the score may be 50 (see specific scores defined 81 * below.) 82 * <p> 83 * For approximate matching, we have a range of scores, let's say 40-70. Depending one how 84 * similar the two strings are, the score will be somewhere between 40 and 70, with the exact 85 * match producing the score of 70. The score may also be 0 if the similarity (distance) 86 * between the strings is below the threshold. 87 * <p> 88 * We use a string matching algorithm, which is particularly suited for 89 * name matching. See {@link NameDistance}. 90 */ 91 private static int[] sMinScore = 92 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 93 private static int[] sMaxScore = 94 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 95 96 /* 97 * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE}, 98 * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are 99 * not! They are useful in three-way aggregation cases when we have, for example, both 100 * John Smith and Smith John. A third contact with the name John Smith should be aggregated 101 * with the former rather than the latter. This is why "reverse" matches have slightly lower 102 * scores than direct matches. 103 */ 104 static { 105 setScoreRange(NameLookupType.NAME_EXACT, 106 NameLookupType.NAME_EXACT, 99, 99); 107 setScoreRange(NameLookupType.NAME_VARIANT, 108 NameLookupType.NAME_VARIANT, 90, 90); 109 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 110 NameLookupType.NAME_COLLATION_KEY, 50, 80); 111 112 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 113 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60); 114 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 115 NameLookupType.NICKNAME, 50, 60); 116 117 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 118 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 119 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 120 NameLookupType.NAME_COLLATION_KEY, 50, 60); 121 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 122 NameLookupType.NICKNAME, 50, 60); 123 124 setScoreRange(NameLookupType.NICKNAME, 125 NameLookupType.NICKNAME, 50, 60); 126 setScoreRange(NameLookupType.NICKNAME, 127 NameLookupType.NAME_COLLATION_KEY, 50, 60); 128 setScoreRange(NameLookupType.NICKNAME, 129 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 130 } 131 132 /** 133 * Populates the cells of the score matrix and score span matrix 134 * corresponding to the {@code candidateNameType} and {@code nameType}. 135 */ 136 private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, 137 int scoreTo) { 138 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 139 sMinScore[index] = scoreFrom; 140 sMaxScore[index] = scoreTo; 141 } 142 143 /** 144 * Returns the lower range for the match score for the given {@code candidateNameType} and 145 * {@code nameType}. 146 */ 147 private static int getMinScore(int candidateNameType, int nameType) { 148 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 149 return sMinScore[index]; 150 } 151 152 /** 153 * Returns the upper range for the match score for the given {@code candidateNameType} and 154 * {@code nameType}. 155 */ 156 private static int getMaxScore(int candidateNameType, int nameType) { 157 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 158 return sMaxScore[index]; 159 } 160 161 private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>(); 162 private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>(); 163 private int mScoreCount = 0; 164 165 private final NameDistance mNameDistanceConservative = new NameDistance(); 166 private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH); 167 168 private MatchScore getMatchingScore(long rawContactId, long contactId, long accountId) { 169 MatchScore matchingScore = mScores.get(rawContactId); 170 if (matchingScore == null) { 171 if (mScoreList.size() > mScoreCount) { 172 matchingScore = mScoreList.get(mScoreCount); 173 matchingScore.reset(rawContactId, contactId, accountId); 174 } else { 175 matchingScore = new MatchScore(rawContactId, contactId, accountId); 176 mScoreList.add(matchingScore); 177 } 178 mScoreCount++; 179 mScores.put(rawContactId, matchingScore); 180 } 181 return matchingScore; 182 } 183 184 /** 185 * Checks if there is a match and updates the overall score for the 186 * specified contact for a discovered match. The new score is determined 187 * by the prior score, by the type of name we were looking for, the type 188 * of name we found and, if the match is approximate, the distance between the candidate and 189 * actual name. 190 */ 191 public void matchName(long rawContactId, long contactId, long accountId, int 192 candidateNameType, String candidateName, int nameType, String name, int algorithm) { 193 int maxScore = getMaxScore(candidateNameType, nameType); 194 if (maxScore == 0) { 195 return; 196 } 197 198 if (candidateName.equals(name)) { 199 updatePrimaryScore(rawContactId, contactId, accountId, maxScore); 200 return; 201 } 202 203 if (algorithm == MATCHING_ALGORITHM_EXACT) { 204 return; 205 } 206 207 int minScore = getMinScore(candidateNameType, nameType); 208 if (minScore == maxScore) { 209 return; 210 } 211 212 final byte[] decodedCandidateName; 213 final byte[] decodedName; 214 try { 215 decodedCandidateName = Hex.decodeHex(candidateName); 216 decodedName = Hex.decodeHex(name); 217 } catch (RuntimeException e) { 218 // How could this happen?? See bug 6827136 219 Log.e(TAG, "Failed to decode normalized name. Skipping.", e); 220 return; 221 } 222 223 NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ? 224 mNameDistanceConservative : mNameDistanceApproximate; 225 226 int score; 227 float distance = nameDistance.getDistance(decodedCandidateName, decodedName); 228 boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME 229 || nameType == NameLookupType.EMAIL_BASED_NICKNAME; 230 float threshold = emailBased 231 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL 232 : APPROXIMATE_MATCH_THRESHOLD; 233 if (distance > threshold) { 234 score = (int)(minScore + (maxScore - minScore) * (1.0f - distance)); 235 } else { 236 score = 0; 237 } 238 239 updatePrimaryScore(rawContactId, contactId, accountId, score); 240 } 241 242 public void matchIdentity(long rawContactId, long contactId, long accountId) { 243 updateSecondaryScore(rawContactId, contactId, accountId, IDENTITY_MATCH_SCORE); 244 } 245 246 public void updateScoreWithPhoneNumberMatch(long rawContactId, long contactId, long accountId) { 247 updateSecondaryScore(rawContactId, contactId, accountId, PHONE_MATCH_SCORE); 248 } 249 250 public void updateScoreWithEmailMatch(long rawContactId, long contactId, long accountId) { 251 updateSecondaryScore(rawContactId, contactId, accountId, EMAIL_MATCH_SCORE); 252 } 253 254 public void updateScoreWithNicknameMatch(long rawContactId, long contactId, long accountId) { 255 updateSecondaryScore(rawContactId, contactId, accountId, NICKNAME_MATCH_SCORE); 256 } 257 258 private void updatePrimaryScore(long rawContactId, long contactId, long accountId, int score) { 259 getMatchingScore(rawContactId, contactId, accountId).updatePrimaryScore(score); 260 } 261 262 private void updateSecondaryScore(long rawContactId, long contactId, long accountId, 263 int score) { 264 getMatchingScore(rawContactId, contactId, accountId).updateSecondaryScore(score); 265 } 266 267 public void keepIn(long rawContactId, long contactId, long accountId) { 268 getMatchingScore(rawContactId, contactId, accountId).keepIn(); 269 } 270 271 public void keepOut(long rawContactId, long contactId, long accountId) { 272 getMatchingScore(rawContactId, contactId, accountId).keepOut(); 273 } 274 275 public void clear() { 276 mScores.clear(); 277 mScoreCount = 0; 278 } 279 /** 280 * Returns a list of IDs for raw contacts that are only matched on secondary data elements 281 * (phone number, email address, nickname, identity). We need to check if they are missing 282 * structured name or not to decide if they should be aggregated. 283 * <p> 284 * May return null. 285 */ 286 public List<Long> prepareSecondaryMatchCandidates() { 287 ArrayList<Long> rawContactIds = null; 288 289 for (int i = 0; i < mScoreCount; i++) { 290 MatchScore score = mScoreList.get(i); 291 if (score.isKeepOut() || score.getPrimaryScore() > SCORE_THRESHOLD_PRIMARY){ 292 continue; 293 } 294 295 if (score.getSecondaryScore() >= SCORE_THRESHOLD_PRIMARY) { 296 if (rawContactIds == null) { 297 rawContactIds = new ArrayList<>(); 298 } 299 rawContactIds.add(score.getRawContactId()); 300 } 301 score.setPrimaryScore(0); 302 } 303 return rawContactIds; 304 } 305 306 /** 307 * Returns the list of raw contact Ids with the match score over threshold. 308 */ 309 public List<MatchScore> pickBestMatches() { 310 final List<MatchScore> matches = new ArrayList<>(); 311 for (int i = 0; i < mScoreCount; i++) { 312 MatchScore score = mScoreList.get(i); 313 if (score.isKeepOut()) { 314 continue; 315 } 316 317 if (score.isKeepIn()) { 318 matches.add(score); 319 continue; 320 } 321 322 if (score.getPrimaryScore() >= SCORE_THRESHOLD_PRIMARY || 323 (score.getPrimaryScore() == SCORE_THRESHOLD_NO_NAME && 324 score.getSecondaryScore() > SCORE_THRESHOLD_SECONDARY)) { 325 matches.add(score); 326 } 327 } 328 return matches; 329 } 330 331 /** 332 * Returns matches in the order of descending score. 333 */ 334 public List<MatchScore> pickBestMatches(int threshold) { 335 int scaledThreshold = threshold * SCORE_SCALE; 336 List<MatchScore> matches = mScoreList.subList(0, mScoreCount); 337 Collections.sort(matches); 338 int count = 0; 339 for (int i = 0; i < mScoreCount; i++) { 340 MatchScore matchScore = matches.get(i); 341 if (matchScore.getScore() >= scaledThreshold) { 342 count++; 343 } else { 344 break; 345 } 346 } 347 348 return matches.subList(0, count); 349 } 350 351 @Override 352 public String toString() { 353 return mScoreList.subList(0, mScoreCount).toString(); 354 } 355 356 public void matchNoName(Long rawContactId, Long contactId, Long accountId) { 357 updatePrimaryScore(rawContactId, contactId, accountId, SCORE_THRESHOLD_NO_NAME); 358 } 359 } 360