1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License 15 */ 16 package com.android.providers.contacts; 17 18 import com.android.providers.contacts.ContactsDatabaseHelper.NameLookupType; 19 20 import java.util.ArrayList; 21 import java.util.Collections; 22 import java.util.HashMap; 23 import java.util.List; 24 25 /** 26 * Logic for matching contacts' data and accumulating match scores. 27 */ 28 public class ContactMatcher { 29 30 // Best possible match score 31 public static final int MAX_SCORE = 100; 32 33 // Suggest to aggregate contacts if their match score is equal or greater than this threshold 34 public static final int SCORE_THRESHOLD_SUGGEST = 50; 35 36 // Automatically aggregate contacts if their match score is equal or greater than this threshold 37 public static final int SCORE_THRESHOLD_PRIMARY = 70; 38 39 // Automatically aggregate contacts if the match score is equal or greater than this threshold 40 // and there is a secondary match (phone number, email etc). 41 public static final int SCORE_THRESHOLD_SECONDARY = 50; 42 43 // Score for missing data (as opposed to present data but a bad match) 44 private static final int NO_DATA_SCORE = -1; 45 46 // Score for matching phone numbers 47 private static final int PHONE_MATCH_SCORE = 71; 48 49 // Score for matching email addresses 50 private static final int EMAIL_MATCH_SCORE = 71; 51 52 // Score for matching nickname 53 private static final int NICKNAME_MATCH_SCORE = 71; 54 55 // Maximum number of characters in a name to be considered by the matching algorithm. 56 private static final int MAX_MATCHED_NAME_LENGTH = 30; 57 58 // Scores a multiplied by this number to allow room for "fractional" scores 59 private static final int SCORE_SCALE = 1000; 60 61 public static final int MATCHING_ALGORITHM_EXACT = 0; 62 public static final int MATCHING_ALGORITHM_CONSERVATIVE = 1; 63 public static final int MATCHING_ALGORITHM_APPROXIMATE = 2; 64 65 // Minimum edit distance between two names to be considered an approximate match 66 public static final float APPROXIMATE_MATCH_THRESHOLD = 0.82f; 67 68 // Minimum edit distance between two email ids to be considered an approximate match 69 public static final float APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL = 0.95f; 70 71 /** 72 * Name matching scores: a matrix by name type vs. candidate lookup type. 73 * For example, if the name type is "full name" while we are looking for a 74 * "full name", the score may be 99. If we are looking for a "nickname" but 75 * find "first name", the score may be 50 (see specific scores defined 76 * below.) 77 * <p> 78 * For approximate matching, we have a range of scores, let's say 40-70. Depending one how 79 * similar the two strings are, the score will be somewhere between 40 and 70, with the exact 80 * match producing the score of 70. The score may also be 0 if the similarity (distance) 81 * between the strings is below the threshold. 82 * <p> 83 * We use a string matching algorithm, which is particularly suited for 84 * name matching. See {@link NameDistance}. 85 */ 86 private static int[] sMinScore = 87 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 88 private static int[] sMaxScore = 89 new int[NameLookupType.TYPE_COUNT * NameLookupType.TYPE_COUNT]; 90 91 /* 92 * Note: the reverse names ({@link NameLookupType#FULL_NAME_REVERSE}, 93 * {@link NameLookupType#FULL_NAME_REVERSE_CONCATENATED} may appear to be redundant. They are 94 * not! They are useful in three-way aggregation cases when we have, for example, both 95 * John Smith and Smith John. A third contact with the name John Smith should be aggregated 96 * with the former rather than the latter. This is why "reverse" matches have slightly lower 97 * scores than direct matches. 98 */ 99 static { 100 setScoreRange(NameLookupType.NAME_EXACT, 101 NameLookupType.NAME_EXACT, 99, 99); 102 setScoreRange(NameLookupType.NAME_VARIANT, 103 NameLookupType.NAME_VARIANT, 90, 90); 104 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 105 NameLookupType.NAME_COLLATION_KEY, 50, 80); 106 107 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 108 NameLookupType.EMAIL_BASED_NICKNAME, 30, 60); 109 setScoreRange(NameLookupType.NAME_COLLATION_KEY, 110 NameLookupType.NICKNAME, 50, 60); 111 112 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 113 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 114 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 115 NameLookupType.NAME_COLLATION_KEY, 50, 60); 116 setScoreRange(NameLookupType.EMAIL_BASED_NICKNAME, 117 NameLookupType.NICKNAME, 50, 60); 118 119 setScoreRange(NameLookupType.NICKNAME, 120 NameLookupType.NICKNAME, 50, 60); 121 setScoreRange(NameLookupType.NICKNAME, 122 NameLookupType.NAME_COLLATION_KEY, 50, 60); 123 setScoreRange(NameLookupType.NICKNAME, 124 NameLookupType.EMAIL_BASED_NICKNAME, 50, 60); 125 } 126 127 /** 128 * Populates the cells of the score matrix and score span matrix 129 * corresponding to the {@code candidateNameType} and {@code nameType}. 130 */ 131 private static void setScoreRange(int candidateNameType, int nameType, int scoreFrom, int scoreTo) { 132 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 133 sMinScore[index] = scoreFrom; 134 sMaxScore[index] = scoreTo; 135 } 136 137 /** 138 * Returns the lower range for the match score for the given {@code candidateNameType} and 139 * {@code nameType}. 140 */ 141 private static int getMinScore(int candidateNameType, int nameType) { 142 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 143 return sMinScore[index]; 144 } 145 146 /** 147 * Returns the upper range for the match score for the given {@code candidateNameType} and 148 * {@code nameType}. 149 */ 150 private static int getMaxScore(int candidateNameType, int nameType) { 151 int index = nameType * NameLookupType.TYPE_COUNT + candidateNameType; 152 return sMaxScore[index]; 153 } 154 155 /** 156 * Captures the max score and match count for a specific contact. Used in an 157 * contactId - MatchScore map. 158 */ 159 public static class MatchScore implements Comparable<MatchScore> { 160 private long mContactId; 161 private boolean mKeepIn; 162 private boolean mKeepOut; 163 private int mPrimaryScore; 164 private int mSecondaryScore; 165 private int mMatchCount; 166 167 public MatchScore(long contactId) { 168 this.mContactId = contactId; 169 } 170 171 public void reset(long contactId) { 172 this.mContactId = contactId; 173 mKeepIn = false; 174 mKeepOut = false; 175 mPrimaryScore = 0; 176 mSecondaryScore = 0; 177 mMatchCount = 0; 178 } 179 180 public long getContactId() { 181 return mContactId; 182 } 183 184 public void updatePrimaryScore(int score) { 185 if (score > mPrimaryScore) { 186 mPrimaryScore = score; 187 } 188 mMatchCount++; 189 } 190 191 public void updateSecondaryScore(int score) { 192 if (score > mSecondaryScore) { 193 mSecondaryScore = score; 194 } 195 mMatchCount++; 196 } 197 198 public void keepIn() { 199 mKeepIn = true; 200 } 201 202 public void keepOut() { 203 mKeepOut = true; 204 } 205 206 public int getScore() { 207 if (mKeepOut) { 208 return 0; 209 } 210 211 if (mKeepIn) { 212 return MAX_SCORE; 213 } 214 215 int score = (mPrimaryScore > mSecondaryScore ? mPrimaryScore : mSecondaryScore); 216 217 // Ensure that of two contacts with the same match score the one with more matching 218 // data elements wins. 219 return score * SCORE_SCALE + mMatchCount; 220 } 221 222 /** 223 * Descending order of match score. 224 */ 225 public int compareTo(MatchScore another) { 226 return another.getScore() - getScore(); 227 } 228 229 @Override 230 public String toString() { 231 return mContactId + ": " + mPrimaryScore + "/" + mSecondaryScore + "(" + mMatchCount 232 + ")"; 233 } 234 } 235 236 private final HashMap<Long, MatchScore> mScores = new HashMap<Long, MatchScore>(); 237 private final ArrayList<MatchScore> mScoreList = new ArrayList<MatchScore>(); 238 private int mScoreCount = 0; 239 240 private final NameDistance mNameDistanceConservative = new NameDistance(); 241 private final NameDistance mNameDistanceApproximate = new NameDistance(MAX_MATCHED_NAME_LENGTH); 242 243 private MatchScore getMatchingScore(long contactId) { 244 MatchScore matchingScore = mScores.get(contactId); 245 if (matchingScore == null) { 246 if (mScoreList.size() > mScoreCount) { 247 matchingScore = mScoreList.get(mScoreCount); 248 matchingScore.reset(contactId); 249 } else { 250 matchingScore = new MatchScore(contactId); 251 mScoreList.add(matchingScore); 252 } 253 mScoreCount++; 254 mScores.put(contactId, matchingScore); 255 } 256 return matchingScore; 257 } 258 259 /** 260 * Checks if there is a match and updates the overall score for the 261 * specified contact for a discovered match. The new score is determined 262 * by the prior score, by the type of name we were looking for, the type 263 * of name we found and, if the match is approximate, the distance between the candidate and 264 * actual name. 265 */ 266 public void matchName(long contactId, int candidateNameType, String candidateName, 267 int nameType, String name, int algorithm) { 268 int maxScore = getMaxScore(candidateNameType, nameType); 269 if (maxScore == 0) { 270 return; 271 } 272 273 if (candidateName.equals(name)) { 274 updatePrimaryScore(contactId, maxScore); 275 return; 276 } 277 278 if (algorithm == MATCHING_ALGORITHM_EXACT) { 279 return; 280 } 281 282 int minScore = getMinScore(candidateNameType, nameType); 283 if (minScore == maxScore) { 284 return; 285 } 286 287 byte[] decodedCandidateName = Hex.decodeHex(candidateName); 288 byte[] decodedName = Hex.decodeHex(name); 289 290 NameDistance nameDistance = algorithm == MATCHING_ALGORITHM_CONSERVATIVE ? 291 mNameDistanceConservative : mNameDistanceApproximate; 292 293 int score; 294 float distance = nameDistance.getDistance(decodedCandidateName, decodedName); 295 boolean emailBased = candidateNameType == NameLookupType.EMAIL_BASED_NICKNAME 296 || nameType == NameLookupType.EMAIL_BASED_NICKNAME; 297 float threshold = emailBased 298 ? APPROXIMATE_MATCH_THRESHOLD_FOR_EMAIL 299 : APPROXIMATE_MATCH_THRESHOLD; 300 if (distance > threshold) { 301 score = (int)(minScore + (maxScore - minScore) * (1.0f - distance)); 302 } else { 303 score = 0; 304 } 305 306 updatePrimaryScore(contactId, score); 307 } 308 309 public void updateScoreWithPhoneNumberMatch(long contactId) { 310 updateSecondaryScore(contactId, PHONE_MATCH_SCORE); 311 } 312 313 public void updateScoreWithEmailMatch(long contactId) { 314 updateSecondaryScore(contactId, EMAIL_MATCH_SCORE); 315 } 316 317 public void updateScoreWithNicknameMatch(long contactId) { 318 updateSecondaryScore(contactId, NICKNAME_MATCH_SCORE); 319 } 320 321 private void updatePrimaryScore(long contactId, int score) { 322 getMatchingScore(contactId).updatePrimaryScore(score); 323 } 324 325 private void updateSecondaryScore(long contactId, int score) { 326 getMatchingScore(contactId).updateSecondaryScore(score); 327 } 328 329 public void keepIn(long contactId) { 330 getMatchingScore(contactId).keepIn(); 331 } 332 333 public void keepOut(long contactId) { 334 getMatchingScore(contactId).keepOut(); 335 } 336 337 public void clear() { 338 mScores.clear(); 339 mScoreCount = 0; 340 } 341 342 /** 343 * Returns a list of IDs for contacts that are matched on secondary data elements 344 * (phone number, email address, nickname). We still need to obtain the approximate 345 * primary score for those contacts to determine if any of them should be aggregated. 346 * <p> 347 * May return null. 348 */ 349 public List<Long> prepareSecondaryMatchCandidates(int threshold) { 350 ArrayList<Long> contactIds = null; 351 352 for (int i = 0; i < mScoreCount; i++) { 353 MatchScore score = mScoreList.get(i); 354 if (score.mKeepOut) { 355 continue; 356 } 357 358 int s = score.mSecondaryScore; 359 if (s >= threshold) { 360 if (contactIds == null) { 361 contactIds = new ArrayList<Long>(); 362 } 363 contactIds.add(score.mContactId); 364 } 365 score.mPrimaryScore = NO_DATA_SCORE; 366 } 367 return contactIds; 368 } 369 370 /** 371 * Returns the contactId with the best match score over the specified threshold or -1 372 * if no such contact is found. 373 */ 374 public long pickBestMatch(int threshold) { 375 long contactId = -1; 376 int maxScore = 0; 377 for (int i = 0; i < mScoreCount; i++) { 378 MatchScore score = mScoreList.get(i); 379 if (score.mKeepOut) { 380 continue; 381 } 382 383 if (score.mKeepIn) { 384 return score.mContactId; 385 } 386 387 int s = score.mPrimaryScore; 388 if (s == NO_DATA_SCORE) { 389 s = score.mSecondaryScore; 390 } 391 392 if (s >= threshold && s > maxScore) { 393 contactId = score.mContactId; 394 maxScore = s; 395 } 396 } 397 return contactId; 398 } 399 400 /** 401 * Returns matches in the order of descending score. 402 */ 403 public List<MatchScore> pickBestMatches(int threshold) { 404 int scaledThreshold = threshold * SCORE_SCALE; 405 List<MatchScore> matches = mScoreList.subList(0, mScoreCount); 406 Collections.sort(matches); 407 int count = 0; 408 for (int i = 0; i < mScoreCount; i++) { 409 MatchScore matchScore = matches.get(i); 410 if (matchScore.getScore() >= scaledThreshold) { 411 count++; 412 } else { 413 break; 414 } 415 } 416 417 return matches.subList(0, count); 418 } 419 420 @Override 421 public String toString() { 422 return mScoreList.toString(); 423 } 424 } 425