1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts; 17 18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; 19 20 import java.util.ArrayList; 21 import java.util.Collections; 22 import java.util.HashMap; 23 import java.util.List; 24 25 /** 26 * Logic for matching contacts' data and accumulating match scores. 27 */ 28 public class ContactMatcher { 29 30 // Best possible match score 31 public static final int MAX_SCORE = 100; 32 33 // Suggest to aggregate contacts if their match score is equal or greater than this threshold 34 public static final int SCORE_THRESHOLD_SUGGEST = 50; 35 36 // Automatically aggregate contacts if their match score is equal or greater than this threshold 37 public static final int SCORE_THRESHOLD_PRIMARY = 70; 38 39 // Automatically aggregate contacts if the match score is equal or greater than this threshold 40 // and there is a secondary match (phone number, email etc). 41 public static final int SCORE_THRESHOLD_SECONDARY = 50; 42 43 // Score for missing data (as opposed to present data but a bad match) 44 private static final int NO_DATA_SCORE = -1; 45 46 // Score for matching phone numbers 47 private static final int PHONE_MATCH_SCORE = 71; 48 49 // Score for matching email addresses 50 private static final int EMAIL_MATCH_SCORE = 71; 51 52 // Score for matching nickname 53 private static final int NICKNAME_MATCH_SCORE = 71; 54 55 // Maximum number of characters in a name to be considered by the matching algorithm. 56 private static final int MAX_MATCHED_NAME_LENGTH = 30; 57 58 // Scores a multiplied by this number to allow room for "fractional" scores 59 private static final int SCORE_SCALE = 1000; 60 61 public static final int MATCHING_ALGORITHM_EXACT = 0; 62 public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; 63 public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; 64 65 // Minimum edit distance between two names to be considered an approximate match 66 public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; 67 68 // Minimum edit distance between two email ids to be considered an approximate match 69 public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; 70 71 // Returned value when we found multiple matches and that was not allowed 72 public static final long MULTIPLE_MATCHES = -2; 73 74 /** 75 * Name matching scores: a matrix by name type vs. candidate lookup type. 76 * For example, if the name type is "full name" while we are looking for a 77 * "full name", the score may be 99. If we are looking for a "nickname" but 78 * find "first name", the score may be 50 (see specific scores defined 79 * below.) 80 * <p> 81 * For approximate matching, we have a range of scores, let's say 40-70. Depending one how 82 * similar the two strings are, the score will be somewhere between 40 and 70, with the exact 83 * match producing the score of 70. The score may also be 0 if the similarity (distance) 84 * between the strings is below the threshold. 85 * <p> 86 * We use a string matching algorithm, which is particularly suited for 87 * name matching. See {@link NameDistance}. 88 */ 89 private static int[] sMinScore = 90 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 91 private static int[] sMaxScore = 92 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 93 94 /* 95 * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE}, 96 * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are 97 * not! They are useful in three-way aggregation cases when we have, for example, both 98 * John Smith and Smith John. A third contact with the name John Smith should be aggregated 99 * with the former rather than the latter. This is why "reverse" matches have slightly lower 100 * scores than direct matches. 101 */ 102 static { 103 setScoreRange(NameLookupType.NAME_EXACT, 104 NameLookupType.NAME_EXACT, 99, 99); 105 setScoreRange(NameLookupType.NAME_VARIANT, 106 NameLookupType.NAME_VARIANT, 90, 90); 107 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 108 NameLookupType.NAME_COLLATION_KEY, 50, 80); 109 110 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 111 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60); 112 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 113 NameLookupType.NICKNAME, 50, 60); 114 115 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 116 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 117 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 118 NameLookupType.NAME_COLLATION_KEY, 50, 60); 119 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 120 NameLookupType.NICKNAME, 50, 60); 121 122 setScoreRange(NameLookupType.NICKNAME, 123 NameLookupType.NICKNAME, 50, 60); 124 setScoreRange(NameLookupType.NICKNAME, 125 NameLookupType.NAME_COLLATION_KEY, 50, 60); 126 setScoreRange(NameLookupType.NICKNAME, 127 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 128 } 129 130 /** 131 * Populates the cells of the score matrix and score span matrix 132 * corresponding to the {@code candidateNameType} and {@code nameType}. 133 */ 134 private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) { 135 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 136 sMinScore[index] = scoreFrom; 137 sMaxScore[index] = scoreTo; 138 } 139 140 /** 141 * Returns the lower range for the match score for the given {@code candidateNameType} and 142 * {@code nameType}. 143 */ 144 private static int getMinScore(int candidateNameType, int nameType) { 145 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 146 return sMinScore[index]; 147 } 148 149 /** 150 * Returns the upper range for the match score for the given {@code candidateNameType} and 151 * {@code nameType}. 152 */ 153 private static int getMaxScore(int candidateNameType, int nameType) { 154 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 155 return sMaxScore[index]; 156 } 157 158 /** 159 * Captures the max score and match count for a specific contact. Used in an 160 * contactId - MatchScore map. 161 */ 162 public static class MatchScore implements Comparable<MatchScore> { 163 private long mContactId; 164 private boolean mKeepIn; 165 private boolean mKeepOut; 166 private int mPrimaryScore; 167 private int mSecondaryScore; 168 private int mMatchCount; 169 170 public MatchScore(long contactId) { 171 this.mContactId = contactId; 172 } 173 174 public void reset(long contactId) { 175 this.mContactId = contactId; 176 mKeepIn = false; 177 mKeepOut = false; 178 mPrimaryScore = 0; 179 mSecondaryScore = 0; 180 mMatchCount = 0; 181 } 182 183 public long getContactId() { 184 return mContactId; 185 } 186 187 public void updatePrimaryScore(int score) { 188 if (score > mPrimaryScore) { 189 mPrimaryScore = score; 190 } 191 mMatchCount++; 192 } 193 194 public void updateSecondaryScore(int score) { 195 if (score > mSecondaryScore) { 196 mSecondaryScore = score; 197 } 198 mMatchCount++; 199 } 200 201 public void keepIn() { 202 mKeepIn = true; 203 } 204 205 public void keepOut() { 206 mKeepOut = true; 207 } 208 209 public int getScore() { 210 if (mKeepOut) { 211 return 0; 212 } 213 214 if (mKeepIn) { 215 return MAX_SCORE; 216 } 217 218 int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore); 219 220 // Ensure that of two contacts with the same match score the one with more matching 221 // data elements wins. 222 return score * SCORE_SCALE + mMatchCount; 223 } 224 225 /** 226 * Descending order of match score. 227 */ 228 @Override 229 public int compareTo(MatchScore another) { 230 return another.getScore() - getScore(); 231 } 232 233 @Override 234 public String toString() { 235 return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount 236 + ")"; 237 } 238 } 239 240 private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>(); 241 private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>(); 242 private int mScoreCount = 0; 243 244 private final NameDistance mNameDistanceConservative = new NameDistance(); 245 private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH); 246 247 private MatchScore getMatchingScore(long contactId) { 248 MatchScore matchingScore = mScores.get(contactId); 249 if (matchingScore == null) { 250 if (mScoreList.size() > mScoreCount) { 251 matchingScore = mScoreList.get(mScoreCount); 252 matchingScore.reset(contactId); 253 } else { 254 matchingScore = new MatchScore(contactId); 255 mScoreList.add(matchingScore); 256 } 257 mScoreCount++; 258 mScores.put(contactId, matchingScore); 259 } 260 return matchingScore; 261 } 262 263 /** 264 * Marks the contact as a full match, because we found an Identity match 265 */ 266 public void matchIdentity(long contactId) { 267 updatePrimaryScore(contactId, MAX_SCORE); 268 } 269 270 /** 271 * Checks if there is a match and updates the overall score for the 272 * specified contact for a discovered match. The new score is determined 273 * by the prior score, by the type of name we were looking for, the type 274 * of name we found and, if the match is approximate, the distance between the candidate and 275 * actual name. 276 */ 277 public void matchName(long contactId, int candidateNameType, String candidateName, 278 int nameType, String name, int algorithm) { 279 int maxScore = getMaxScore(candidateNameType, nameType); 280 if (maxScore == 0) { 281 return; 282 } 283 284 if (candidateName.equals(name)) { 285 updatePrimaryScore(contactId, maxScore); 286 return; 287 } 288 289 if (algorithm == MATCHING_ALGORITHM_EXACT) { 290 return; 291 } 292 293 int minScore = getMinScore(candidateNameType, nameType); 294 if (minScore == maxScore) { 295 return; 296 } 297 298 byte[] decodedCandidateName = Hex.decodeHex(candidateName); 299 byte[] decodedName = Hex.decodeHex(name); 300 301 NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ? 302 mNameDistanceConservative : mNameDistanceApproximate; 303 304 int score; 305 float distance = nameDistance.getDistance(decodedCandidateName, decodedName); 306 boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME 307 || nameType == NameLookupType.EMAIL_BASED_NICKNAME; 308 float threshold = emailBased 309 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL 310 : APPROXIMATE_MATCH_THRESHOLD; 311 if (distance > threshold) { 312 score = (int)(minScore + (maxScore - minScore) * (1.0f - distance)); 313 } else { 314 score = 0; 315 } 316 317 updatePrimaryScore(contactId, score); 318 } 319 320 public void updateScoreWithPhoneNumberMatch(long contactId) { 321 updateSecondaryScore(contactId, PHONE_MATCH_SCORE); 322 } 323 324 public void updateScoreWithEmailMatch(long contactId) { 325 updateSecondaryScore(contactId, EMAIL_MATCH_SCORE); 326 } 327 328 public void updateScoreWithNicknameMatch(long contactId) { 329 updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE); 330 } 331 332 private void updatePrimaryScore(long contactId, int score) { 333 getMatchingScore(contactId).updatePrimaryScore(score); 334 } 335 336 private void updateSecondaryScore(long contactId, int score) { 337 getMatchingScore(contactId).updateSecondaryScore(score); 338 } 339 340 public void keepIn(long contactId) { 341 getMatchingScore(contactId).keepIn(); 342 } 343 344 public void keepOut(long contactId) { 345 getMatchingScore(contactId).keepOut(); 346 } 347 348 public void clear() { 349 mScores.clear(); 350 mScoreCount = 0; 351 } 352 353 /** 354 * Returns a list of IDs for contacts that are matched on secondary data elements 355 * (phone number, email address, nickname). We still need to obtain the approximate 356 * primary score for those contacts to determine if any of them should be aggregated. 357 * <p> 358 * May return null. 359 */ 360 public List<Long> prepareSecondaryMatchCandidates(int threshold) { 361 ArrayList<Long> contactIds = null; 362 363 for (int i = 0; i < mScoreCount; i++) { 364 MatchScore score = mScoreList.get(i); 365 if (score.mKeepOut) { 366 continue; 367 } 368 369 int s = score.mSecondaryScore; 370 if (s >= threshold) { 371 if (contactIds == null) { 372 contactIds = new ArrayList<Long>(); 373 } 374 contactIds.add(score.mContactId); 375 } 376 score.mPrimaryScore = NO_DATA_SCORE; 377 } 378 return contactIds; 379 } 380 381 /** 382 * Returns the contactId with the best match score over the specified threshold or -1 383 * if no such contact is found. 384 */ 385 public long pickBestMatch(int threshold, boolean allowMultipleMatches) { 386 long contactId = -1; 387 int maxScore = 0; 388 for (int i = 0; i < mScoreCount; i++) { 389 MatchScore score = mScoreList.get(i); 390 if (score.mKeepOut) { 391 continue; 392 } 393 394 if (score.mKeepIn) { 395 return score.mContactId; 396 } 397 398 int s = score.mPrimaryScore; 399 if (s == NO_DATA_SCORE) { 400 s = score.mSecondaryScore; 401 } 402 403 if (s >= threshold) { 404 if (contactId != -1 && !allowMultipleMatches) { 405 return MULTIPLE_MATCHES; 406 } 407 if (s > maxScore) { 408 contactId = score.mContactId; 409 maxScore = s; 410 } 411 } 412 } 413 return contactId; 414 } 415 416 /** 417 * Returns matches in the order of descending score. 418 */ 419 public List<MatchScore> pickBestMatches(int threshold) { 420 int scaledThreshold = threshold * SCORE_SCALE; 421 List<MatchScore> matches = mScoreList.subList(0, mScoreCount); 422 Collections.sort(matches); 423 int count = 0; 424 for (int i = 0; i < mScoreCount; i++) { 425 MatchScore matchScore = matches.get(i); 426 if (matchScore.getScore() >= scaledThreshold) { 427 count++; 428 } else { 429 break; 430 } 431 } 432 433 return matches.subList(0, count); 434 } 435 436 @Override 437 public String toString() { 438 return mScoreList.subList(0, mScoreCount).toString(); 439 } 440 } 441