1 /* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.dialer.dialpad; 18 19 import android.text.TextUtils; 20 21 import com.android.dialer.dialpad.SmartDialTrie.CountryCodeWithOffset; 22 23 import com.google.common.annotations.VisibleForTesting; 24 import com.google.common.collect.Lists; 25 26 import java.util.ArrayList; 27 28 /** 29 * {@link #SmartDialNameMatcher} contains utility functions to remove accents from accented 30 * characters and normalize a phone number. It also contains the matching logic that determines if 31 * a contact's display name matches a numeric query. The boolean variable 32 * {@link #ALLOW_INITIAL_MATCH} controls the behavior of the matching logic and determines 33 * whether we allow matches like 57 - (J)ohn (S)mith. 34 */ 35 public class SmartDialNameMatcher { 36 37 private final String mQuery; 38 39 public static final char[] LATIN_LETTERS_TO_DIGITS = { 40 '2', '2', '2', // A,B,C -> 2 41 '3', '3', '3', // D,E,F -> 3 42 '4', '4', '4', // G,H,I -> 4 43 '5', '5', '5', // J,K,L -> 5 44 '6', '6', '6', // M,N,O -> 6 45 '7', '7', '7', '7', // P,Q,R,S -> 7 46 '8', '8', '8', // T,U,V -> 8 47 '9', '9', '9', '9' // W,X,Y,Z -> 9 48 }; 49 50 // Whether or not we allow matches like 57 - (J)ohn (S)mith 51 private static final boolean ALLOW_INITIAL_MATCH = true; 52 53 // The maximum length of the initial we will match - typically set to 1 to minimize false 54 // positives 55 private static final int INITIAL_LENGTH_LIMIT = 1; 56 57 /* 58 * The switch statement in this function was generated using the python code: 59 * from unidecode import unidecode 60 * for i in range(192, 564): 61 * char = unichr(i) 62 * decoded = unidecode(char) 63 * # Unicode characters that decompose into multiple characters i.e. 64 * # into ss are not supported for now 65 * if (len(decoded) == 1 and decoded.isalpha()): 66 * print "case '" + char + "': return '" + unidecode(char) + "';" 67 * 68 * This gives us a way to map characters containing accents/diacritics to their 69 * alphabetic equivalents. The unidecode library can be found at: 70 * http://pypi.python.org/pypi/Unidecode/0.04.1 71 * 72 * Also remaps all upper case latin characters to their lower case equivalents. 73 */ 74 public static char remapAccentedChars(char c) { 75 switch (c) { 76 case '': return 'a'; 77 case '': return 'a'; 78 case '': return 'a'; 79 case '': return 'a'; 80 case '': return 'a'; 81 case '': return 'a'; 82 case '': return 'c'; 83 case '': return 'e'; 84 case '': return 'e'; 85 case '': return 'e'; 86 case '': return 'e'; 87 case '': return 'i'; 88 case '': return 'i'; 89 case '': return 'i'; 90 case '': return 'i'; 91 case '': return 'd'; 92 case '': return 'n'; 93 case '': return 'o'; 94 case '': return 'o'; 95 case '': return 'o'; 96 case '': return 'o'; 97 case '': return 'o'; 98 case '': return 'x'; 99 case '': return 'o'; 100 case '': return 'u'; 101 case '': return 'u'; 102 case '': return 'u'; 103 case '': return 'u'; 104 case '': return 'u'; 105 case '': return 'a'; 106 case '': return 'a'; 107 case '': return 'a'; 108 case '': return 'a'; 109 case '': return 'a'; 110 case '': return 'a'; 111 case '': return 'c'; 112 case '': return 'e'; 113 case '': return 'e'; 114 case '': return 'e'; 115 case '': return 'e'; 116 case '': return 'i'; 117 case '': return 'i'; 118 case '': return 'i'; 119 case '': return 'i'; 120 case '': return 'd'; 121 case '': return 'n'; 122 case '': return 'o'; 123 case '': return 'o'; 124 case '': return 'o'; 125 case '': return 'o'; 126 case '': return 'o'; 127 case '': return 'o'; 128 case '': return 'u'; 129 case '': return 'u'; 130 case '': return 'u'; 131 case '': return 'u'; 132 case '': return 'y'; 133 case '': return 'y'; 134 case '': return 'a'; 135 case '': return 'a'; 136 case '': return 'a'; 137 case '': return 'a'; 138 case '': return 'a'; 139 case '': return 'a'; 140 case '': return 'c'; 141 case '': return 'c'; 142 case '': return 'c'; 143 case '': return 'c'; 144 case '': return 'c'; 145 case '': return 'c'; 146 case '': return 'c'; 147 case '': return 'c'; 148 case '': return 'd'; 149 case '': return 'd'; 150 case '': return 'd'; 151 case '': return 'd'; 152 case '': return 'e'; 153 case '': return 'e'; 154 case '': return 'e'; 155 case '': return 'e'; 156 case '': return 'e'; 157 case '': return 'e'; 158 case '': return 'e'; 159 case '': return 'e'; 160 case '': return 'e'; 161 case '': return 'e'; 162 case '': return 'g'; 163 case '': return 'g'; 164 case '': return 'g'; 165 case '': return 'g'; 166 case '': return 'g'; 167 case '': return 'g'; 168 case '': return 'g'; 169 case '': return 'g'; 170 case '': return 'h'; 171 case '': return 'h'; 172 case '': return 'h'; 173 case '': return 'h'; 174 case '': return 'i'; 175 case '': return 'i'; 176 case '': return 'i'; 177 case '': return 'i'; 178 case '': return 'i'; 179 case '': return 'i'; 180 case '': return 'i'; 181 case '': return 'i'; 182 case '': return 'i'; 183 case '': return 'i'; 184 case '': return 'j'; 185 case '': return 'j'; 186 case '': return 'k'; 187 case '': return 'k'; 188 case '': return 'k'; 189 case '': return 'l'; 190 case '': return 'l'; 191 case '': return 'l'; 192 case '': return 'l'; 193 case '': return 'l'; 194 case '': return 'l'; 195 case '': return 'l'; 196 case '': return 'l'; 197 case '': return 'l'; 198 case '': return 'l'; 199 case '': return 'n'; 200 case '': return 'n'; 201 case '': return 'n'; 202 case '': return 'n'; 203 case '': return 'n'; 204 case '': return 'n'; 205 case '': return 'o'; 206 case '': return 'o'; 207 case '': return 'o'; 208 case '': return 'o'; 209 case '': return 'o'; 210 case '': return 'o'; 211 case '': return 'r'; 212 case '': return 'r'; 213 case '': return 'r'; 214 case '': return 'r'; 215 case '': return 'r'; 216 case '': return 'r'; 217 case '': return 's'; 218 case '': return 's'; 219 case '': return 's'; 220 case '': return 's'; 221 case '': return 's'; 222 case '': return 's'; 223 case '': return 's'; 224 case '': return 's'; 225 case '': return 't'; 226 case '': return 't'; 227 case '': return 't'; 228 case '': return 't'; 229 case '': return 't'; 230 case '': return 't'; 231 case '': return 'u'; 232 case '': return 'u'; 233 case '': return 'u'; 234 case '': return 'u'; 235 case '': return 'u'; 236 case '': return 'u'; 237 case '': return 'u'; 238 case '': return 'u'; 239 case '': return 'u'; 240 case '': return 'u'; 241 case '': return 'u'; 242 case '': return 'u'; 243 case '': return 'w'; 244 case '': return 'w'; 245 case '': return 'y'; 246 case '': return 'y'; 247 case '': return 'y'; 248 case '': return 'z'; 249 case '': return 'z'; 250 case '': return 'z'; 251 case '': return 'z'; 252 case '': return 'z'; 253 case '': return 'z'; 254 case '': return 's'; 255 case '': return 'b'; 256 case '': return 'b'; 257 case '': return 'b'; 258 case '': return 'b'; 259 case '': return 'o'; 260 case '': return 'c'; 261 case '': return 'c'; 262 case '': return 'd'; 263 case '': return 'd'; 264 case '': return 'd'; 265 case '': return 'd'; 266 case '': return 'd'; 267 case '': return 'e'; 268 case '': return 'f'; 269 case '': return 'f'; 270 case '': return 'g'; 271 case '': return 'g'; 272 case '': return 'i'; 273 case '': return 'i'; 274 case '': return 'k'; 275 case '': return 'k'; 276 case '': return 'l'; 277 case '': return 'l'; 278 case '': return 'w'; 279 case '': return 'n'; 280 case '': return 'n'; 281 case '': return 'o'; 282 case '': return 'o'; 283 case '': return 'o'; 284 case '': return 'p'; 285 case '': return 'p'; 286 case '': return 't'; 287 case '': return 't'; 288 case '': return 't'; 289 case '': return 't'; 290 case '': return 'u'; 291 case '': return 'u'; 292 case '': return 'y'; 293 case '': return 'v'; 294 case '': return 'y'; 295 case '': return 'y'; 296 case '': return 'z'; 297 case '': return 'z'; 298 case '': return 'w'; 299 case '': return 'a'; 300 case '': return 'a'; 301 case '': return 'i'; 302 case '': return 'i'; 303 case '': return 'o'; 304 case '': return 'o'; 305 case '': return 'u'; 306 case '': return 'u'; 307 case '': return 'u'; 308 case '': return 'u'; 309 case '': return 'u'; 310 case '': return 'u'; 311 case '': return 'u'; 312 case '': return 'u'; 313 case '': return 'u'; 314 case '': return 'u'; 315 case '': return 'a'; 316 case '': return 'a'; 317 case '': return 'a'; 318 case '': return 'a'; 319 case '': return 'g'; 320 case '': return 'g'; 321 case '': return 'g'; 322 case '': return 'g'; 323 case '': return 'k'; 324 case '': return 'k'; 325 case '': return 'o'; 326 case '': return 'o'; 327 case '': return 'o'; 328 case '': return 'o'; 329 case '': return 'j'; 330 case '': return 'd'; 331 case '': return 'g'; 332 case '': return 'g'; 333 case '': return 'w'; 334 case '': return 'n'; 335 case '': return 'n'; 336 case '': return 'a'; 337 case '': return 'a'; 338 case '': return 'o'; 339 case '': return 'o'; 340 case '': return 'a'; 341 case '': return 'a'; 342 case '': return 'a'; 343 case '': return 'a'; 344 case '': return 'e'; 345 case '': return 'e'; 346 case '': return 'e'; 347 case '': return 'e'; 348 case '': return 'i'; 349 case '': return 'i'; 350 case '': return 'i'; 351 case '': return 'i'; 352 case '': return 'o'; 353 case '': return 'o'; 354 case '': return 'o'; 355 case '': return 'o'; 356 case '': return 'r'; 357 case '': return 'r'; 358 case '': return 'r'; 359 case '': return 'r'; 360 case '': return 'u'; 361 case '': return 'u'; 362 case '': return 'u'; 363 case '': return 'u'; 364 case '': return 's'; 365 case '': return 's'; 366 case '': return 't'; 367 case '': return 't'; 368 case '': return 'y'; 369 case '': return 'y'; 370 case '': return 'h'; 371 case '': return 'h'; 372 case '': return 'z'; 373 case '': return 'z'; 374 case '': return 'a'; 375 case '': return 'a'; 376 case '': return 'e'; 377 case '': return 'e'; 378 case '': return 'o'; 379 case '': return 'o'; 380 case '': return 'o'; 381 case '': return 'o'; 382 case '': return 'o'; 383 case '': return 'o'; 384 case '': return 'o'; 385 case '': return 'o'; 386 case '': return 'y'; 387 case '': return 'y'; 388 case 'A': return 'a'; 389 case 'B': return 'b'; 390 case 'C': return 'c'; 391 case 'D': return 'd'; 392 case 'E': return 'e'; 393 case 'F': return 'f'; 394 case 'G': return 'g'; 395 case 'H': return 'h'; 396 case 'I': return 'i'; 397 case 'J': return 'j'; 398 case 'K': return 'k'; 399 case 'L': return 'l'; 400 case 'M': return 'm'; 401 case 'N': return 'n'; 402 case 'O': return 'o'; 403 case 'P': return 'p'; 404 case 'Q': return 'q'; 405 case 'R': return 'r'; 406 case 'S': return 's'; 407 case 'T': return 't'; 408 case 'U': return 'u'; 409 case 'V': return 'v'; 410 case 'W': return 'w'; 411 case 'X': return 'x'; 412 case 'Y': return 'y'; 413 case 'Z': return 'z'; 414 default: return c; 415 } 416 } 417 418 private final ArrayList<SmartDialMatchPosition> mMatchPositions = Lists.newArrayList(); 419 420 public SmartDialNameMatcher(String query) { 421 mQuery = query; 422 } 423 424 /** 425 * Strips a phone number of unnecessary characters (spaces, dashes, etc.) 426 * 427 * @param number Phone number we want to normalize 428 * @return Phone number consisting of digits from 0-9 429 */ 430 public static String normalizeNumber(String number) { 431 return normalizeNumber(number, 0); 432 } 433 434 /** 435 * Strips a phone number of unnecessary characters (spaces, dashes, etc.) 436 * 437 * @param number Phone number we want to normalize 438 * @param offset Offset to start from 439 * @return Phone number consisting of digits from 0-9 440 */ 441 public static String normalizeNumber(String number, int offset) { 442 final StringBuilder s = new StringBuilder(); 443 for (int i = offset; i < number.length(); i++) { 444 char ch = number.charAt(i); 445 if (ch >= '0' && ch <= '9') { 446 s.append(ch); 447 } 448 } 449 return s.toString(); 450 } 451 452 /** 453 * Matches a phone number against a query, taking care of formatting characters and also 454 * taking into account country code prefixes and special NANP number treatment. 455 * 456 * @param phoneNumber - Raw phone number 457 * @param query - Normalized query (only contains numbers from 0-9) 458 * @param matchNanp - Whether or not to do special matching for NANP numbers 459 * @return {@literal null} if the number and the query don't match, a valid 460 * SmartDialMatchPosition with the matching positions otherwise 461 */ 462 public static SmartDialMatchPosition matchesNumber(String phoneNumber, String query, 463 boolean matchNanp) { 464 // Try matching the number as is 465 SmartDialMatchPosition matchPos = matchesNumberWithOffset(phoneNumber, query, 0); 466 if (matchPos == null) { 467 // Try matching the number without the '+' prefix, if any 468 final CountryCodeWithOffset code = SmartDialTrie.getOffsetWithoutCountryCode( 469 phoneNumber); 470 if (code != null) { 471 matchPos = matchesNumberWithOffset(phoneNumber, query, code.offset); 472 } 473 if (matchPos == null && matchNanp) { 474 // Try matching NANP numbers 475 final int[] offsets = SmartDialTrie.getOffsetForNANPNumbers(phoneNumber); 476 for (int i = 0; i < offsets.length; i++) { 477 matchPos = matchesNumberWithOffset(phoneNumber, query, offsets[i]); 478 if (matchPos != null) break; 479 } 480 } 481 } 482 return matchPos; 483 } 484 485 /** 486 * Matches a phone number against a query, taking care of formatting characters 487 * 488 * @param phoneNumber - Raw phone number 489 * @param query - Normalized query (only contains numbers from 0-9) 490 * @param offset - The position in the number to start the match against (used to ignore 491 * leading prefixes/country codes) 492 * @return {@literal null} if the number and the query don't match, a valid 493 * SmartDialMatchPosition with the matching positions otherwise 494 */ 495 private static SmartDialMatchPosition matchesNumberWithOffset(String phoneNumber, String query, 496 int offset) { 497 if (TextUtils.isEmpty(phoneNumber) || TextUtils.isEmpty(query)) { 498 return null; 499 } 500 int queryAt = 0; 501 int numberAt = offset; 502 for (int i = offset; i < phoneNumber.length(); i++) { 503 if (queryAt == query.length()) { 504 break; 505 } 506 char ch = phoneNumber.charAt(i); 507 if (ch >= '0' && ch <= '9') { 508 if (ch != query.charAt(queryAt)) { 509 return null; 510 } 511 queryAt++; 512 } else { 513 if (queryAt == 0) { 514 // Found a separator before any part of the query was matched, so advance the 515 // offset to avoid prematurely highlighting separators before the rest of the 516 // query. 517 // E.g. don't highlight the first '-' if we're matching 1-510-111-1111 with 518 // '510'. 519 // However, if the current offset is 0, just include the beginning separators 520 // anyway, otherwise the highlighting ends up looking weird. 521 // E.g. if we're matching (510)-111-1111 with '510', we should include the 522 // first '('. 523 if (offset != 0) { 524 offset++; 525 } 526 } 527 } 528 numberAt++; 529 } 530 return new SmartDialMatchPosition(0 + offset, numberAt); 531 } 532 533 /** 534 * This function iterates through each token in the display name, trying to match the query 535 * to the numeric equivalent of the token. 536 * 537 * A token is defined as a range in the display name delimited by characters that have no 538 * latin alphabet equivalents (e.g. spaces - ' ', periods - ',', underscores - '_' or chinese 539 * characters - ''). Transliteration from non-latin characters to latin character will be 540 * done on a best effort basis - e.g. '' - 'u'. 541 * 542 * For example, 543 * the display name "Phillips Thomas Jr" contains three tokens: "phillips", "thomas", and "jr". 544 * 545 * A match must begin at the start of a token. 546 * For example, typing 846(Tho) would match "Phillips Thomas", but 466(hom) would not. 547 * 548 * Also, a match can extend across tokens. 549 * For example, typing 37337(FredS) would match (Fred S)mith. 550 * 551 * @param displayName The normalized(no accented characters) display name we intend to match 552 * against. 553 * @param query The string of digits that we want to match the display name to. 554 * @param matchList An array list of {@link SmartDialMatchPosition}s that we add matched 555 * positions to. 556 * @return Returns true if a combination of the tokens in displayName match the query 557 * string contained in query. If the function returns true, matchList will contain an 558 * ArrayList of match positions (multiple matches correspond to initial matches). 559 */ 560 @VisibleForTesting 561 boolean matchesCombination(String displayName, String query, 562 ArrayList<SmartDialMatchPosition> matchList) { 563 final int nameLength = displayName.length(); 564 final int queryLength = query.length(); 565 566 if (nameLength < queryLength) { 567 return false; 568 } 569 570 if (queryLength == 0) { 571 return false; 572 } 573 574 // The current character index in displayName 575 // E.g. 3 corresponds to 'd' in "Fred Smith" 576 int nameStart = 0; 577 578 // The current character in the query we are trying to match the displayName against 579 int queryStart = 0; 580 581 // The start position of the current token we are inspecting 582 int tokenStart = 0; 583 584 // The number of non-alphabetic characters we've encountered so far in the current match. 585 // E.g. if we've currently matched 3733764849 to (Fred Smith W)illiam, then the 586 // seperatorCount should be 2. This allows us to correctly calculate offsets for the match 587 // positions 588 int seperatorCount = 0; 589 590 ArrayList<SmartDialMatchPosition> partial = new ArrayList<SmartDialMatchPosition>(); 591 // Keep going until we reach the end of displayName 592 while (nameStart < nameLength && queryStart < queryLength) { 593 char ch = displayName.charAt(nameStart); 594 // Strip diacritics from accented characters if any 595 ch = remapAccentedChars(ch); 596 if (isLowercaseLatinLetterOrDigit(ch)) { 597 if (ch >= 'a' && ch <= 'z') { 598 // a starts at index 0. If ch >= '0' && ch <= '9', we don't have to do anything 599 ch = LATIN_LETTERS_TO_DIGITS[ch - 'a']; 600 } 601 if (ch != query.charAt(queryStart)) { 602 // Failed to match the current character in the query. 603 604 // Case 1: Failed to match the first character in the query. Skip to the next 605 // token since there is no chance of this token matching the query. 606 607 // Case 2: Previous characters in the query matched, but the current character 608 // failed to match. This happened in the middle of a token. Skip to the next 609 // token since there is no chance of this token matching the query. 610 611 // Case 3: Previous characters in the query matched, but the current character 612 // failed to match. This happened right at the start of the current token. In 613 // this case, we should restart the query and try again with the current token. 614 // Otherwise, we would fail to match a query like "964"(yog) against a name 615 // Yo-Yoghurt because the query match would fail on the 3rd character, and 616 // then skip to the end of the "Yoghurt" token. 617 618 if (queryStart == 0 || isLowercaseLatinLetterOrDigit(remapAccentedChars( 619 displayName.charAt(nameStart - 1)))) { 620 // skip to the next token, in the case of 1 or 2. 621 while (nameStart < nameLength && 622 isLowercaseLatinLetterOrDigit(remapAccentedChars( 623 displayName.charAt(nameStart)))) { 624 nameStart++; 625 } 626 nameStart++; 627 } 628 629 // Restart the query and set the correct token position 630 queryStart = 0; 631 seperatorCount = 0; 632 tokenStart = nameStart; 633 } else { 634 if (queryStart == queryLength - 1) { 635 636 // As much as possible, we prioritize a full token match over a sub token 637 // one so if we find a full token match, we can return right away 638 matchList.add(new SmartDialMatchPosition( 639 tokenStart, queryLength + tokenStart + seperatorCount)); 640 return true; 641 } else if (ALLOW_INITIAL_MATCH && queryStart < INITIAL_LENGTH_LIMIT) { 642 // we matched the first character. 643 // branch off and see if we can find another match with the remaining 644 // characters in the query string and the remaining tokens 645 // find the next separator in the query string 646 int j; 647 for (j = nameStart; j < nameLength; j++) { 648 if (!isLowercaseLatinLetterOrDigit(remapAccentedChars( 649 displayName.charAt(j)))) { 650 break; 651 } 652 } 653 // this means there is at least one character left after the separator 654 if (j < nameLength - 1) { 655 final String remainder = displayName.substring(j + 1); 656 final ArrayList<SmartDialMatchPosition> partialTemp = 657 Lists.newArrayList(); 658 if (matchesCombination( 659 remainder, query.substring(queryStart + 1), partialTemp)) { 660 661 // store the list of possible match positions 662 SmartDialMatchPosition.advanceMatchPositions(partialTemp, j + 1); 663 partialTemp.add(0, 664 new SmartDialMatchPosition(nameStart, nameStart + 1)); 665 // we found a partial token match, store the data in a 666 // temp buffer and return it if we end up not finding a full 667 // token match 668 partial = partialTemp; 669 } 670 } 671 } 672 nameStart++; 673 queryStart++; 674 // we matched the current character in the name against one in the query, 675 // continue and see if the rest of the characters match 676 } 677 } else { 678 // found a separator, we skip this character and continue to the next one 679 nameStart++; 680 if (queryStart == 0) { 681 // This means we found a separator before the start of a token, 682 // so we should increment the token's start position to reflect its true 683 // start position 684 tokenStart = nameStart; 685 } else { 686 // Otherwise this separator was found in the middle of a token being matched, 687 // so increase the separator count 688 seperatorCount++; 689 } 690 } 691 } 692 // if we have no complete match at this point, then we attempt to fall back to the partial 693 // token match(if any). If we don't allow initial matching (ALLOW_INITIAL_MATCH = false) 694 // then partial will always be empty. 695 if (!partial.isEmpty()) { 696 matchList.addAll(partial); 697 return true; 698 } 699 return false; 700 } 701 702 /* 703 * Returns true if the character is a lowercase latin character or digit(i.e. non-separator). 704 */ 705 private boolean isLowercaseLatinLetterOrDigit(char ch) { 706 return (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9'); 707 } 708 709 public boolean matches(String displayName) { 710 mMatchPositions.clear(); 711 return matchesCombination(displayName, mQuery, mMatchPositions); 712 } 713 714 public ArrayList<SmartDialMatchPosition> getMatchPositions() { 715 // Return a clone of mMatchPositions so that the caller can use it without 716 // worrying about it changing 717 return new ArrayList<SmartDialMatchPosition>(mMatchPositions); 718 } 719 720 public String getQuery() { 721 return mQuery; 722 } 723 } 724