Home | History | Annotate | Download | only in webkit
      1 /*
      2  * Copyright (C) 2018 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package android.webkit;
     18 
     19 import java.util.Locale;
     20 import java.util.regex.MatchResult;
     21 import java.util.regex.Matcher;
     22 import java.util.regex.Pattern;
     23 
     24 /**
     25  * Java implementation of legacy WebView.findAddress algorithm.
     26  *
     27  * @hide
     28  */
     29 class FindAddress {
     30     static class ZipRange {
     31         int mLow;
     32         int mHigh;
     33         int mException1;
     34         int mException2;
     35         ZipRange(int low, int high, int exception1, int exception2) {
     36             mLow = low;
     37             mHigh = high;
     38             mException1 = exception1;
     39             mException2 = exception1;
     40         }
     41         boolean matches(String zipCode) {
     42             int prefix = Integer.parseInt(zipCode.substring(0, 2));
     43             return (mLow <= prefix && prefix <= mHigh) || prefix == mException1
     44                     || prefix == mException2;
     45         }
     46     }
     47 
     48     // Addresses consist of at least this many words, not including state and zip code.
     49     private static final int MIN_ADDRESS_WORDS = 4;
     50 
     51     // Adddresses consist of at most this many words, not including state and zip code.
     52     private static final int MAX_ADDRESS_WORDS = 14;
     53 
     54     // Addresses consist of at most this many lines.
     55     private static final int MAX_ADDRESS_LINES = 5;
     56 
     57     // No words in an address are longer than this many characters.
     58     private static final int kMaxAddressNameWordLength = 25;
     59 
     60     // Location name should be in the first MAX_LOCATION_NAME_DISTANCE words
     61     private static final int MAX_LOCATION_NAME_DISTANCE = 5;
     62 
     63     private static final ZipRange[] sStateZipCodeRanges = {
     64             new ZipRange(99, 99, -1, -1), // AK Alaska.
     65             new ZipRange(35, 36, -1, -1), // AL Alabama.
     66             new ZipRange(71, 72, -1, -1), // AR Arkansas.
     67             new ZipRange(96, 96, -1, -1), // AS American Samoa.
     68             new ZipRange(85, 86, -1, -1), // AZ Arizona.
     69             new ZipRange(90, 96, -1, -1), // CA California.
     70             new ZipRange(80, 81, -1, -1), // CO Colorado.
     71             new ZipRange(6, 6, -1, -1), // CT Connecticut.
     72             new ZipRange(20, 20, -1, -1), // DC District of Columbia.
     73             new ZipRange(19, 19, -1, -1), // DE Delaware.
     74             new ZipRange(32, 34, -1, -1), // FL Florida.
     75             new ZipRange(96, 96, -1, -1), // FM Federated States of Micronesia.
     76             new ZipRange(30, 31, -1, -1), // GA Georgia.
     77             new ZipRange(96, 96, -1, -1), // GU Guam.
     78             new ZipRange(96, 96, -1, -1), // HI Hawaii.
     79             new ZipRange(50, 52, -1, -1), // IA Iowa.
     80             new ZipRange(83, 83, -1, -1), // ID Idaho.
     81             new ZipRange(60, 62, -1, -1), // IL Illinois.
     82             new ZipRange(46, 47, -1, -1), // IN Indiana.
     83             new ZipRange(66, 67, 73, -1), // KS Kansas.
     84             new ZipRange(40, 42, -1, -1), // KY Kentucky.
     85             new ZipRange(70, 71, -1, -1), // LA Louisiana.
     86             new ZipRange(1, 2, -1, -1), // MA Massachusetts.
     87             new ZipRange(20, 21, -1, -1), // MD Maryland.
     88             new ZipRange(3, 4, -1, -1), // ME Maine.
     89             new ZipRange(96, 96, -1, -1), // MH Marshall Islands.
     90             new ZipRange(48, 49, -1, -1), // MI Michigan.
     91             new ZipRange(55, 56, -1, -1), // MN Minnesota.
     92             new ZipRange(63, 65, -1, -1), // MO Missouri.
     93             new ZipRange(96, 96, -1, -1), // MP Northern Mariana Islands.
     94             new ZipRange(38, 39, -1, -1), // MS Mississippi.
     95             new ZipRange(55, 56, -1, -1), // MT Montana.
     96             new ZipRange(27, 28, -1, -1), // NC North Carolina.
     97             new ZipRange(58, 58, -1, -1), // ND North Dakota.
     98             new ZipRange(68, 69, -1, -1), // NE Nebraska.
     99             new ZipRange(3, 4, -1, -1), // NH New Hampshire.
    100             new ZipRange(7, 8, -1, -1), // NJ New Jersey.
    101             new ZipRange(87, 88, 86, -1), // NM New Mexico.
    102             new ZipRange(88, 89, 96, -1), // NV Nevada.
    103             new ZipRange(10, 14, 0, 6), // NY New York.
    104             new ZipRange(43, 45, -1, -1), // OH Ohio.
    105             new ZipRange(73, 74, -1, -1), // OK Oklahoma.
    106             new ZipRange(97, 97, -1, -1), // OR Oregon.
    107             new ZipRange(15, 19, -1, -1), // PA Pennsylvania.
    108             new ZipRange(6, 6, 0, 9), // PR Puerto Rico.
    109             new ZipRange(96, 96, -1, -1), // PW Palau.
    110             new ZipRange(2, 2, -1, -1), // RI Rhode Island.
    111             new ZipRange(29, 29, -1, -1), // SC South Carolina.
    112             new ZipRange(57, 57, -1, -1), // SD South Dakota.
    113             new ZipRange(37, 38, -1, -1), // TN Tennessee.
    114             new ZipRange(75, 79, 87, 88), // TX Texas.
    115             new ZipRange(84, 84, -1, -1), // UT Utah.
    116             new ZipRange(22, 24, 20, -1), // VA Virginia.
    117             new ZipRange(6, 9, -1, -1), // VI Virgin Islands.
    118             new ZipRange(5, 5, -1, -1), // VT Vermont.
    119             new ZipRange(98, 99, -1, -1), // WA Washington.
    120             new ZipRange(53, 54, -1, -1), // WI Wisconsin.
    121             new ZipRange(24, 26, -1, -1), // WV West Virginia.
    122             new ZipRange(82, 83, -1, -1) // WY Wyoming.
    123     };
    124 
    125     // Newlines
    126     private static final String NL = "\n\u000B\u000C\r\u0085\u2028\u2029";
    127 
    128     // Space characters
    129     private static final String SP = "\u0009\u0020\u00A0\u1680\u2000\u2001"
    130             + "\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u202F"
    131             + "\u205F\u3000";
    132 
    133     // Whitespace
    134     private static final String WS = SP + NL;
    135 
    136     // Characters that are considered word delimiters.
    137     private static final String WORD_DELIM = ",*\u2022" + WS;
    138 
    139     // Lookahead for word end.
    140     private static final String WORD_END = "(?=[" + WORD_DELIM + "]|$)";
    141 
    142     // Address words are a sequence of non-delimiter characters.
    143     private static final Pattern sWordRe =
    144             Pattern.compile("[^" + WORD_DELIM + "]+" + WORD_END, Pattern.CASE_INSENSITIVE);
    145 
    146     // Characters that are considered suffix delimiters for house numbers.
    147     private static final String HOUSE_POST_DELIM = ",\"'" + WS;
    148 
    149     // Lookahead for house end.
    150     private static final String HOUSE_END = "(?=[" + HOUSE_POST_DELIM + "]|$)";
    151 
    152     // Characters that are considered prefix delimiters for house numbers.
    153     private static final String HOUSE_PRE_DELIM = ":" + HOUSE_POST_DELIM;
    154 
    155     // A house number component is "one" or a number, optionally
    156     // followed by a single alphabetic character, or
    157     private static final String HOUSE_COMPONENT = "(?:one|\\d+([a-z](?=[^a-z]|$)|st|nd|rd|th)?)";
    158 
    159     // House numbers are a repetition of |HOUSE_COMPONENT|, separated by -, and followed by
    160     // a delimiter character.
    161     private static final Pattern sHouseNumberRe =
    162             Pattern.compile(HOUSE_COMPONENT + "(?:-" + HOUSE_COMPONENT + ")*" + HOUSE_END,
    163                     Pattern.CASE_INSENSITIVE);
    164 
    165     // XXX: do we want to accept whitespace other than 0x20 in state names?
    166     private static final Pattern sStateRe = Pattern.compile("(?:"
    167                     + "(ak|alaska)|"
    168                     + "(al|alabama)|"
    169                     + "(ar|arkansas)|"
    170                     + "(as|american[" + SP + "]+samoa)|"
    171                     + "(az|arizona)|"
    172                     + "(ca|california)|"
    173                     + "(co|colorado)|"
    174                     + "(ct|connecticut)|"
    175                     + "(dc|district[" + SP + "]+of[" + SP + "]+columbia)|"
    176                     + "(de|delaware)|"
    177                     + "(fl|florida)|"
    178                     + "(fm|federated[" + SP + "]+states[" + SP + "]+of[" + SP + "]+micronesia)|"
    179                     + "(ga|georgia)|"
    180                     + "(gu|guam)|"
    181                     + "(hi|hawaii)|"
    182                     + "(ia|iowa)|"
    183                     + "(id|idaho)|"
    184                     + "(il|illinois)|"
    185                     + "(in|indiana)|"
    186                     + "(ks|kansas)|"
    187                     + "(ky|kentucky)|"
    188                     + "(la|louisiana)|"
    189                     + "(ma|massachusetts)|"
    190                     + "(md|maryland)|"
    191                     + "(me|maine)|"
    192                     + "(mh|marshall[" + SP + "]+islands)|"
    193                     + "(mi|michigan)|"
    194                     + "(mn|minnesota)|"
    195                     + "(mo|missouri)|"
    196                     + "(mp|northern[" + SP + "]+mariana[" + SP + "]+islands)|"
    197                     + "(ms|mississippi)|"
    198                     + "(mt|montana)|"
    199                     + "(nc|north[" + SP + "]+carolina)|"
    200                     + "(nd|north[" + SP + "]+dakota)|"
    201                     + "(ne|nebraska)|"
    202                     + "(nh|new[" + SP + "]+hampshire)|"
    203                     + "(nj|new[" + SP + "]+jersey)|"
    204                     + "(nm|new[" + SP + "]+mexico)|"
    205                     + "(nv|nevada)|"
    206                     + "(ny|new[" + SP + "]+york)|"
    207                     + "(oh|ohio)|"
    208                     + "(ok|oklahoma)|"
    209                     + "(or|oregon)|"
    210                     + "(pa|pennsylvania)|"
    211                     + "(pr|puerto[" + SP + "]+rico)|"
    212                     + "(pw|palau)|"
    213                     + "(ri|rhode[" + SP + "]+island)|"
    214                     + "(sc|south[" + SP + "]+carolina)|"
    215                     + "(sd|south[" + SP + "]+dakota)|"
    216                     + "(tn|tennessee)|"
    217                     + "(tx|texas)|"
    218                     + "(ut|utah)|"
    219                     + "(va|virginia)|"
    220                     + "(vi|virgin[" + SP + "]+islands)|"
    221                     + "(vt|vermont)|"
    222                     + "(wa|washington)|"
    223                     + "(wi|wisconsin)|"
    224                     + "(wv|west[" + SP + "]+virginia)|"
    225                     + "(wy|wyoming)"
    226                     + ")" + WORD_END,
    227             Pattern.CASE_INSENSITIVE);
    228 
    229     private static final Pattern sLocationNameRe = Pattern.compile("(?:"
    230                     + "alley|annex|arcade|ave[.]?|avenue|alameda|bayou|"
    231                     + "beach|bend|bluffs?|bottom|boulevard|branch|bridge|"
    232                     + "brooks?|burgs?|bypass|broadway|camino|camp|canyon|"
    233                     + "cape|causeway|centers?|circles?|cliffs?|club|common|"
    234                     + "corners?|course|courts?|coves?|creek|crescent|crest|"
    235                     + "crossing|crossroad|curve|circulo|dale|dam|divide|"
    236                     + "drives?|estates?|expressway|extensions?|falls?|ferry|"
    237                     + "fields?|flats?|fords?|forest|forges?|forks?|fort|"
    238                     + "freeway|gardens?|gateway|glens?|greens?|groves?|"
    239                     + "harbors?|haven|heights|highway|hills?|hollow|inlet|"
    240                     + "islands?|isle|junctions?|keys?|knolls?|lakes?|land|"
    241                     + "landing|lane|lights?|loaf|locks?|lodge|loop|mall|"
    242                     + "manors?|meadows?|mews|mills?|mission|motorway|mount|"
    243                     + "mountains?|neck|orchard|oval|overpass|parks?|"
    244                     + "parkways?|pass|passage|path|pike|pines?|plains?|"
    245                     + "plaza|points?|ports?|prairie|privada|radial|ramp|"
    246                     + "ranch|rapids?|rd[.]?|rest|ridges?|river|roads?|route|"
    247                     + "row|rue|run|shoals?|shores?|skyway|springs?|spurs?|"
    248                     + "squares?|station|stravenue|stream|st[.]?|streets?|"
    249                     + "summit|speedway|terrace|throughway|trace|track|"
    250                     + "trafficway|trail|tunnel|turnpike|underpass|unions?|"
    251                     + "valleys?|viaduct|views?|villages?|ville|vista|walks?|"
    252                     + "wall|ways?|wells?|xing|xrd)" + WORD_END,
    253             Pattern.CASE_INSENSITIVE);
    254 
    255     private static final Pattern sSuffixedNumberRe =
    256             Pattern.compile("(\\d+)(st|nd|rd|th)", Pattern.CASE_INSENSITIVE);
    257 
    258     private static final Pattern sZipCodeRe =
    259             Pattern.compile("(?:\\d{5}(?:-\\d{4})?)" + WORD_END, Pattern.CASE_INSENSITIVE);
    260 
    261     private static boolean checkHouseNumber(String houseNumber) {
    262         // Make sure that there are at most 5 digits.
    263         int digitCount = 0;
    264         for (int i = 0; i < houseNumber.length(); ++i) {
    265             if (Character.isDigit(houseNumber.charAt(i))) ++digitCount;
    266         }
    267         if (digitCount > 5) return false;
    268 
    269         // Make sure that any ordinals are valid.
    270         Matcher suffixMatcher = sSuffixedNumberRe.matcher(houseNumber);
    271         while (suffixMatcher.find()) {
    272             int num = Integer.parseInt(suffixMatcher.group(1));
    273             if (num == 0) {
    274                 return false; // 0th is invalid.
    275             }
    276             String suffix = suffixMatcher.group(2).toLowerCase(Locale.getDefault());
    277             switch (num % 10) {
    278                 case 1:
    279                     return suffix.equals(num % 100 == 11 ? "th" : "st");
    280                 case 2:
    281                     return suffix.equals(num % 100 == 12 ? "th" : "nd");
    282                 case 3:
    283                     return suffix.equals(num % 100 == 13 ? "th" : "rd");
    284                 default:
    285                     return suffix.equals("th");
    286             }
    287         }
    288         return true;
    289     }
    290 
    291     /**
    292      * Attempt to match a house number beginnning at position offset
    293      * in content.  The house number must be followed by a word
    294      * delimiter or the end of the string, and if offset is non-zero,
    295      * then it must also be preceded by a word delimiter.
    296      *
    297      * @return a MatchResult if a valid house number was found.
    298      */
    299     private static MatchResult matchHouseNumber(String content, int offset) {
    300         if (offset > 0 && HOUSE_PRE_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null;
    301         Matcher matcher = sHouseNumberRe.matcher(content).region(offset, content.length());
    302         if (matcher.lookingAt()) {
    303             MatchResult matchResult = matcher.toMatchResult();
    304             if (checkHouseNumber(matchResult.group(0))) return matchResult;
    305         }
    306         return null;
    307     }
    308 
    309     /**
    310      * Attempt to match a US state beginnning at position offset in
    311      * content.  The matching state must be followed by a word
    312      * delimiter or the end of the string, and if offset is non-zero,
    313      * then it must also be preceded by a word delimiter.
    314      *
    315      * @return a MatchResult if a valid US state (or two letter code)
    316      * was found.
    317      */
    318     private static MatchResult matchState(String content, int offset) {
    319         if (offset > 0 && WORD_DELIM.indexOf(content.charAt(offset - 1)) == -1) return null;
    320         Matcher stateMatcher = sStateRe.matcher(content).region(offset, content.length());
    321         return stateMatcher.lookingAt() ? stateMatcher.toMatchResult() : null;
    322     }
    323 
    324     /**
    325      * Test whether zipCode matches the U.S. zip code format (ddddd or
    326      * ddddd-dddd) and is within the expected range, given that
    327      * stateMatch is a match of sStateRe.
    328      *
    329      * @return true if zipCode is a valid zip code, is legal for the
    330      * matched state, and is followed by a word delimiter or the end
    331      * of the string.
    332      */
    333     private static boolean isValidZipCode(String zipCode, MatchResult stateMatch) {
    334         if (stateMatch == null) return false;
    335         // Work out the index of the state, based on which group matched.
    336         int stateIndex = stateMatch.groupCount();
    337         while (stateIndex > 0) {
    338             if (stateMatch.group(stateIndex--) != null) break;
    339         }
    340         return sZipCodeRe.matcher(zipCode).matches()
    341                 && sStateZipCodeRanges[stateIndex].matches(zipCode);
    342     }
    343 
    344     /**
    345      * Test whether location is one of the valid locations.
    346      *
    347      * @return true if location starts with a valid location name
    348      * followed by a word delimiter or the end of the string.
    349      */
    350     private static boolean isValidLocationName(String location) {
    351         return sLocationNameRe.matcher(location).matches();
    352     }
    353 
    354     /**
    355      * Attempt to match a complete address in content, starting with
    356      * houseNumberMatch.
    357      *
    358      * @param content The string to search.
    359      * @param houseNumberMatch A matching house number to start extending.
    360      * @return +ve: the end of the match
    361      *         +ve: the position to restart searching for house numbers, negated.
    362      */
    363     private static int attemptMatch(String content, MatchResult houseNumberMatch) {
    364         int restartPos = -1;
    365         int nonZipMatch = -1;
    366         int it = houseNumberMatch.end();
    367         int numLines = 1;
    368         boolean consecutiveHouseNumbers = true;
    369         boolean foundLocationName = false;
    370         int wordCount = 1;
    371         String lastWord = "";
    372 
    373         Matcher matcher = sWordRe.matcher(content);
    374 
    375         for (; it < content.length(); lastWord = matcher.group(0), it = matcher.end()) {
    376             if (!matcher.find(it)) {
    377                 // No more words in the input sequence.
    378                 return -content.length();
    379             }
    380             if (matcher.end() - matcher.start() > kMaxAddressNameWordLength) {
    381                 // Word is too long to be part of an address. Fail.
    382                 return -matcher.end();
    383             }
    384 
    385             // Count the number of newlines we just consumed.
    386             while (it < matcher.start()) {
    387                 if (NL.indexOf(content.charAt(it++)) != -1) ++numLines;
    388             }
    389 
    390             // Consumed too many lines. Fail.
    391             if (numLines > MAX_ADDRESS_LINES) break;
    392 
    393             // Consumed too many words. Fail.
    394             if (++wordCount > MAX_ADDRESS_WORDS) break;
    395 
    396             if (matchHouseNumber(content, it) != null) {
    397                 if (consecutiveHouseNumbers && numLines > 1) {
    398                     // Last line ended with a number, and this this line starts with one.
    399                     // Restart at this number.
    400                     return -it;
    401                 }
    402                 // Remember the position of this match as the restart position.
    403                 if (restartPos == -1) restartPos = it;
    404                 continue;
    405             }
    406 
    407             consecutiveHouseNumbers = false;
    408 
    409             if (isValidLocationName(matcher.group(0))) {
    410                 foundLocationName = true;
    411                 continue;
    412             }
    413 
    414             if (wordCount == MAX_LOCATION_NAME_DISTANCE && !foundLocationName) {
    415                 // Didn't find a location name in time. Fail.
    416                 it = matcher.end();
    417                 break;
    418             }
    419 
    420             if (foundLocationName && wordCount > MIN_ADDRESS_WORDS) {
    421                 // We can now attempt to match a state.
    422                 MatchResult stateMatch = matchState(content, it);
    423                 if (stateMatch != null) {
    424                     if (lastWord.equals("et") && stateMatch.group(0).equals("al")) {
    425                         // Reject "et al" as a false postitive.
    426                         it = stateMatch.end();
    427                         break;
    428                     }
    429 
    430                     // At this point we've matched a state; try to match a zip code after it.
    431                     Matcher zipMatcher = sWordRe.matcher(content);
    432                     if (zipMatcher.find(stateMatch.end())) {
    433                         if (isValidZipCode(zipMatcher.group(0), stateMatch)) {
    434                             return zipMatcher.end();
    435                         }
    436                     } else {
    437                         // The content ends with a state but no zip
    438                         // code. This is a legal match according to the
    439                         // documentation. N.B. This is equivalent to the
    440                         // original c++ implementation, which only allowed
    441                         // the zip code to be optional at the end of the
    442                         // string, which presumably is a bug.  We tried
    443                         // relaxing this to work in other places but it
    444                         // caused too many false positives.
    445                         nonZipMatch = stateMatch.end();
    446                     }
    447                 }
    448             }
    449         }
    450 
    451         if (nonZipMatch > 0) return nonZipMatch;
    452 
    453         return -(restartPos > 0 ? restartPos : it);
    454     }
    455 
    456     /**
    457      * Return the first matching address in content.
    458      *
    459      * @param content The string to search.
    460      * @return The first valid address, or null if no address was matched.
    461      */
    462     static String findAddress(String content) {
    463         Matcher houseNumberMatcher = sHouseNumberRe.matcher(content);
    464         int start = 0;
    465         while (houseNumberMatcher.find(start)) {
    466             if (checkHouseNumber(houseNumberMatcher.group(0))) {
    467                 start = houseNumberMatcher.start();
    468                 int end = attemptMatch(content, houseNumberMatcher);
    469                 if (end > 0) {
    470                     return content.substring(start, end);
    471                 }
    472                 start = -end;
    473             } else {
    474                 start = houseNumberMatcher.end();
    475             }
    476         }
    477         return null;
    478     }
    479 }
    480