Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2007 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package android.util;
     18 
     19 import java.util.regex.Matcher;
     20 import java.util.regex.Pattern;
     21 
     22 /**
     23  * Commonly used regular expression patterns.
     24  */
     25 public class Patterns {
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     *
     *  <p>Structure: a single outer capturing group wrapping one big
     *  alternation, organised by first letter.  For each letter the
     *  multi-letter names (for example "aero", "museum") are listed before the
     *  character class that covers the two-letter codes starting with that
     *  letter.  Letters that have multi-letter names get their own nested
     *  capturing group; letters with only two-letter codes do not.
     *
     *  <p>The expression contains no anchors, so it can match in the middle of
     *  a longer string; callers that need a whole-string check must use
     *  {@code Matcher.matches()} or add their own anchors.
     *
     *  <p>Use {@link #TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL} for the variant that
     *  uses non-capturing groups.
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
        "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(biz|b[abdefghijmnorstvwyz])"
        + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(info|int|i[delmnoqrst])"
        + "|(jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(name|net|n[acefgilopruz])"
        + "|(org|om)"
        + "|(pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        // Non-Latin names written as Unicode escapes, followed by the "xn--"
        // entries (hyphens regex-escaped) and finally "xxx".
        + "|(\u03b4\u03bf\u03ba\u03b9\u03bc\u03ae|\u0438\u0441\u043f\u044b\u0442\u0430\u043d\u0438\u0435|\u0440\u0444|\u0441\u0440\u0431|\u05d8\u05e2\u05e1\u05d8|\u0622\u0632\u0645\u0627\u06cc\u0634\u06cc|\u0625\u062e\u062a\u0628\u0627\u0631|\u0627\u0644\u0627\u0631\u062f\u0646|\u0627\u0644\u062c\u0632\u0627\u0626\u0631|\u0627\u0644\u0633\u0639\u0648\u062f\u064a\u0629|\u0627\u0644\u0645\u063a\u0631\u0628|\u0627\u0645\u0627\u0631\u0627\u062a|\u0628\u06be\u0627\u0631\u062a|\u062a\u0648\u0646\u0633|\u0633\u0648\u0631\u064a\u0629|\u0641\u0644\u0633\u0637\u064a\u0646|\u0642\u0637\u0631|\u0645\u0635\u0631|\u092a\u0930\u0940\u0915\u094d\u0937\u093e|\u092d\u093e\u0930\u0924|\u09ad\u09be\u09b0\u09a4|\u0a2d\u0a3e\u0a30\u0a24|\u0aad\u0abe\u0ab0\u0aa4|\u0b87\u0ba8\u0bcd\u0ba4\u0bbf\u0baf\u0bbe|\u0b87\u0bb2\u0b99\u0bcd\u0b95\u0bc8|\u0b9a\u0bbf\u0b99\u0bcd\u0b95\u0baa\u0bcd\u0baa\u0bc2\u0bb0\u0bcd|\u0baa\u0bb0\u0bbf\u0b9f\u0bcd\u0b9a\u0bc8|\u0c2d\u0c3e\u0c30\u0c24\u0c4d|\u0dbd\u0d82\u0d9a\u0dcf|\u0e44\u0e17\u0e22|\u30c6\u30b9\u30c8|\u4e2d\u56fd|\u4e2d\u570b|\u53f0\u6e7e|\u53f0\u7063|\u65b0\u52a0\u5761|\u6d4b\u8bd5|\u6e2c\u8a66|\u9999\u6e2f|\ud14c\uc2a4\ud2b8|\ud55c\uad6d|xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-3e0b707e|xn\\-\\-45brj9c|xn\\-\\-80akhbyknj4f|xn\\-\\-90a3ac|xn\\-\\-9t4b11yi5a|xn\\-\\-clchc0ea0b2g2a9gcd|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fpcrj9c3d|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-gecrj9c|xn\\-\\-h2brj9c|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-lgbbat1ad8j|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgbbh1a71e|xn\\-\\-mgbc0a9azcg|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-ogbpf8fl|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-s9brj9c|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-xkc2dl3a5ee0h|xn\\-\\-yfro4i67o|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah|xxx)"
        + "|y[et]"
        + "|z[amw])";
     59 
     60     /**
     61      *  Regular expression pattern to match all IANA top-level domains.
     62      */
     63     public static final Pattern TOP_LEVEL_DOMAIN =
     64         Pattern.compile(TOP_LEVEL_DOMAIN_STR);
     65 
    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     *
     *  <p>Same alternatives as {@link #TOP_LEVEL_DOMAIN_STR}, but every group
     *  is non-capturing ({@code (?:...)}) so that splicing this fragment into
     *  {@link #WEB_URL} does not shift that pattern's group numbers.
     *
     *  <p>NOTE(review): this string is a fragment, not a stand-alone regular
     *  expression.  It opens one top-level {@code (?:} but its last piece ends
     *  with two closing parentheses.  {@link #WEB_URL} depends on that extra
     *  {@code )} to close the named-host group it opens just before
     *  concatenating this constant, so compiling this string by itself is
     *  expected to fail with an unbalanced-parenthesis error.
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
        + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
        + "|(?:biz|b[abdefghijmnorstvwyz])"
        + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
        + "|d[ejkmoz]"
        + "|(?:edu|e[cegrstu])"
        + "|f[ijkmor]"
        + "|(?:gov|g[abdefghilmnpqrstuwy])"
        + "|h[kmnrtu]"
        + "|(?:info|int|i[delmnoqrst])"
        + "|(?:jobs|j[emop])"
        + "|k[eghimnprwyz]"
        + "|l[abcikrstuvy]"
        + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
        + "|(?:name|net|n[acefgilopruz])"
        + "|(?:org|om)"
        + "|(?:pro|p[aefghklmnrstwy])"
        + "|qa"
        + "|r[eosuw]"
        + "|s[abcdeghijklmnortuvyz]"
        + "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
        + "|u[agksyz]"
        + "|v[aceginu]"
        + "|w[fs]"
        // Non-Latin names written as Unicode escapes, followed by the "xn--"
        // entries (hyphens regex-escaped) and finally "xxx".
        + "|(?:\u03b4\u03bf\u03ba\u03b9\u03bc\u03ae|\u0438\u0441\u043f\u044b\u0442\u0430\u043d\u0438\u0435|\u0440\u0444|\u0441\u0440\u0431|\u05d8\u05e2\u05e1\u05d8|\u0622\u0632\u0645\u0627\u06cc\u0634\u06cc|\u0625\u062e\u062a\u0628\u0627\u0631|\u0627\u0644\u0627\u0631\u062f\u0646|\u0627\u0644\u062c\u0632\u0627\u0626\u0631|\u0627\u0644\u0633\u0639\u0648\u062f\u064a\u0629|\u0627\u0644\u0645\u063a\u0631\u0628|\u0627\u0645\u0627\u0631\u0627\u062a|\u0628\u06be\u0627\u0631\u062a|\u062a\u0648\u0646\u0633|\u0633\u0648\u0631\u064a\u0629|\u0641\u0644\u0633\u0637\u064a\u0646|\u0642\u0637\u0631|\u0645\u0635\u0631|\u092a\u0930\u0940\u0915\u094d\u0937\u093e|\u092d\u093e\u0930\u0924|\u09ad\u09be\u09b0\u09a4|\u0a2d\u0a3e\u0a30\u0a24|\u0aad\u0abe\u0ab0\u0aa4|\u0b87\u0ba8\u0bcd\u0ba4\u0bbf\u0baf\u0bbe|\u0b87\u0bb2\u0b99\u0bcd\u0b95\u0bc8|\u0b9a\u0bbf\u0b99\u0bcd\u0b95\u0baa\u0bcd\u0baa\u0bc2\u0bb0\u0bcd|\u0baa\u0bb0\u0bbf\u0b9f\u0bcd\u0b9a\u0bc8|\u0c2d\u0c3e\u0c30\u0c24\u0c4d|\u0dbd\u0d82\u0d9a\u0dcf|\u0e44\u0e17\u0e22|\u30c6\u30b9\u30c8|\u4e2d\u56fd|\u4e2d\u570b|\u53f0\u6e7e|\u53f0\u7063|\u65b0\u52a0\u5761|\u6d4b\u8bd5|\u6e2c\u8a66|\u9999\u6e2f|\ud14c\uc2a4\ud2b8|\ud55c\uad6d|xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-3e0b707e|xn\\-\\-45brj9c|xn\\-\\-80akhbyknj4f|xn\\-\\-90a3ac|xn\\-\\-9t4b11yi5a|xn\\-\\-clchc0ea0b2g2a9gcd|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fpcrj9c3d|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-gecrj9c|xn\\-\\-h2brj9c|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d|xn\\-\\-kpry57d|xn\\-\\-lgbbat1ad8j|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa|xn\\-\\-mgbbh1a71e|xn\\-\\-mgbc0a9azcg|xn\\-\\-mgberp4a5d4ar|xn\\-\\-o3cw4h|xn\\-\\-ogbpf8fl|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-s9brj9c|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a|xn\\-\\-xkc2dl3a5ee0h|xn\\-\\-yfro4i67o|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah|xxx)"
        + "|y[et]"
        + "|z[amw]))";
    100 
    101     /**
    102      * Good characters for Internationalized Resource Identifiers (IRI).
    103      * This comprises most common used Unicode characters allowed in IRI
    104      * as detailed in RFC 3987.
    105      * Specifically, those two byte Unicode characters are not included.
    106      */
    107     public static final String GOOD_IRI_CHAR =
    108         "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";
    109 
    /**
     *  Regular expression pattern to match most part of RFC 3987
     *  Internationalized URLs, aka IRIs.  Commonly used Unicode characters are
     *  added (through {@link #GOOD_IRI_CHAR}).
     *
     *  <p>The expression is assembled from these pieces, in order:
     *  <ul>
     *  <li>optionally, one of the listed scheme spellings (http, https, rtsp,
     *      each also with a capitalised first letter) followed by "://" and an
     *      optional "user@" or "user:password@" part (1-64 and 1-25
     *      characters or percent-escapes respectively);
     *  <li>a host: either one or more dot-terminated labels followed by a
     *      top-level domain from {@link #TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL},
     *      or a dotted-quad numeric address;
     *  <li>optionally, ":" and a port of 1 to 5 digits;
     *  <li>optionally, a path that starts with "/" and continues with
     *      {@link #GOOD_IRI_CHAR} characters, a fixed set of punctuation, or
     *      percent-escapes;
     *  <li>a word boundary or the end of input.
     *  </ul>
     *
     *  <p>Capturing groups: 1 = scheme and user info (if any), host and port;
     *  2 = scheme; 3 = host; 4 = path.
     *
     *  <p>NOTE(review): the string literals below are not balanced on their
     *  own.  The group opened on the "named host" line is closed by the extra
     *  trailing ")" carried by {@link #TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL}.
     */
    public static final Pattern WEB_URL = Pattern.compile(
        // optional scheme, "://", then optional user[:password]@
        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
        + "((?:(?:[" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]{0,64}\\.)+"   // named host: dot-terminated labels...
        + TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL                                        // ...then a TLD (also closes the named-host group)
        + "|(?:(?:25[0-5]|2[0-4]" // or a dotted-quad IP address
        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
        + "|[1-9][0-9]|[0-9])))"
        + "(?:\\:\\d{1,5})?)" // plus an optional port number; this ")" ends group 1
        + "(\\/(?:(?:[" + GOOD_IRI_CHAR + "\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus an optional path, query and fragment
        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
        + "(?:\\b|$)"); // and finally, a word boundary or end of
                        // input.  This is to stop foo.sure from
                        // matching as foo.su
    132 
    133     public static final Pattern IP_ADDRESS
    134         = Pattern.compile(
    135             "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]"
    136             + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]"
    137             + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
    138             + "|[1-9][0-9]|[0-9]))");
    139 
    140     public static final Pattern DOMAIN_NAME
    141         = Pattern.compile(
    142             "(((([" + GOOD_IRI_CHAR + "][" + GOOD_IRI_CHAR + "\\-]*)*[" + GOOD_IRI_CHAR + "]\\.)+"
    143             + TOP_LEVEL_DOMAIN + ")|"
    144             + IP_ADDRESS + ")");
    145 
    146     public static final Pattern EMAIL_ADDRESS
    147         = Pattern.compile(
    148             "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" +
    149             "\\@" +
    150             "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" +
    151             "(" +
    152                 "\\." +
    153                 "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" +
    154             ")+"
    155         );
    156 
    157     /**
    158      * This pattern is intended for searching for things that look like they
    159      * might be phone numbers in arbitrary text, not for validating whether
    160      * something is in fact a phone number.  It will miss many things that
    161      * are legitimate phone numbers.
    162      *
    163      * <p> The pattern matches the following:
    164      * <ul>
    165      * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes
    166      * may follow.
    167      * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes.
    168      * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes.
    169      * </ul>
    170      */
    171     public static final Pattern PHONE
    172         = Pattern.compile(                      // sdd = space, dot, or dash
    173                 "(\\+[0-9]+[\\- \\.]*)?"        // +<digits><sdd>*
    174                 + "(\\([0-9]+\\)[\\- \\.]*)?"   // (<digits>)<sdd>*
    175                 + "([0-9][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit>
    176 
    177     /**
    178      *  Convenience method to take all of the non-null matching groups in a
    179      *  regex Matcher and return them as a concatenated string.
    180      *
    181      *  @param matcher      The Matcher object from which grouped text will
    182      *                      be extracted
    183      *
    184      *  @return             A String comprising all of the non-null matched
    185      *                      groups concatenated together
    186      */
    187     public static final String concatGroups(Matcher matcher) {
    188         StringBuilder b = new StringBuilder();
    189         final int numGroups = matcher.groupCount();
    190 
    191         for (int i = 1; i <= numGroups; i++) {
    192             String s = matcher.group(i);
    193 
    194             System.err.println("Group(" + i + ") : " + s);
    195 
    196             if (s != null) {
    197                 b.append(s);
    198             }
    199         }
    200 
    201         return b.toString();
    202     }
    203 
    204     /**
    205      * Convenience method to return only the digits and plus signs
    206      * in the matching string.
    207      *
    208      * @param matcher      The Matcher object from which digits and plus will
    209      *                     be extracted
    210      *
    211      * @return             A String comprising all of the digits and plus in
    212      *                     the match
    213      */
    214     public static final String digitsAndPlusOnly(Matcher matcher) {
    215         StringBuilder buffer = new StringBuilder();
    216         String matchingRegion = matcher.group();
    217 
    218         for (int i = 0, size = matchingRegion.length(); i < size; i++) {
    219             char character = matchingRegion.charAt(i);
    220 
    221             if (character == '+' || Character.isDigit(character)) {
    222                 buffer.append(character);
    223             }
    224         }
    225         return buffer.toString();
    226     }
    227 
    /**
     * Private so that this class, which only holds static constants and
     * helper methods, is never instantiated.
     */
    private Patterns() {
        // Intentionally empty.
    }
    232 }
    233