Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2007 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     17 package com.google.android.util;
     19 import java.util.ArrayList;
     20 import java.util.HashMap;
     21 import java.util.LinkedHashMap;
     22 import java.util.regex.Matcher;
     23 import java.util.regex.Pattern;
     24 import java.util.Set;
     25 import java.util.List;
     27 /**
     28  *
     29  * Logic for parsing a text message typed by the user looking for smileys,
     30  * urls, acronyms, formatting (e.g., '*'s for bold), me commands
     31  * (e.g., "/me is asleep"), and punctuation.
     32  *
     33  * It constructs an array, which breaks the text up into its
     34  * constituent pieces, which we return to the client.
     35  *
     36  */
     37 public abstract class AbstractMessageParser {
     38 /**
     39  * Interface representing the set of resources needed by a message parser
     40  *
     41  * @author jessan (Jessan Hutchison-Quillian)
     42  */
     43   public static interface Resources {
     45     /** Get the known set of URL schemes. */
     46     public Set<String> getSchemes();
     48     /** Get the possible values for the last part of a domain name.
     49      *  Values are expected to be reversed in the Trie.
     50      */
     51     public TrieNode getDomainSuffixes();
     53     /** Get the smileys accepted by the parser. */
     54     public TrieNode getSmileys();
     56     /** Get the acronyms accepted by the parser. */
     57     public TrieNode getAcronyms();
     58   }
     60   /**
     61    * Subclasses must define the schemes, domains, smileys and acronyms
     62    * that are necessary for parsing.
           *
           * @return the resources used to look up smileys, acronyms, URL schemes
           *         and domain suffixes; the parser calls this on every lookup
           *         rather than caching the result
     63    */
     64   protected abstract Resources getResources();
     66   /** Music note (followed by one space) that indicates user is listening to a music track. */
     67   public static final String musicNote = "\u266B ";
     69   private String text;  // text being parsed; parse() strips a leading "/me" command from it
     70   private int nextChar;  // index in text of the next character to consume
     71   private int nextClass;  // last class id handed out for a punctuation character by getCharClass()
     72   private ArrayList<Part> parts;  // result list, filled by buildParts()
     73   private ArrayList<Token> tokens;  // tokens read by buildParts(); presumably appended to by addToken() (not in this view)
     74   private HashMap<Character,Format> formatStart;  // opening Format token per format character that is not yet closed
     75   private boolean parseSmilies;  // the flags below switch the individual parse steps on or off
     76   private boolean parseAcronyms;
     77   private boolean parseFormatting;
     78   private boolean parseUrls;
     79   private boolean parseMeText;
     80   private boolean parseMusic;
     82   /**
     83    * Create a message parser to parse urls, formatting, acronyms, smileys,
     84    * /me text and music.
     85    *
     86    * @param text the text to parse
     87    */
     88   public AbstractMessageParser(String text) {
     89     this(text, true, true, true, true, true, true);  // every parse option enabled
     90   }
     92   /**
     93    * Create a message parser, specifying the kinds of text to parse
     94    *
     95    * @param text the text to parse
           * @param parseSmilies whether to look for smileys
           * @param parseAcronyms whether to look for acronyms
           * @param parseFormatting whether to interpret formatting characters
           * @param parseUrls whether to look for URLs
           * @param parseMusic whether to look for a leading music-track marker
           * @param parseMeText whether to look for a leading "/me" command
     96    *
     97    */
     98   public AbstractMessageParser(String text, boolean parseSmilies,
     99       boolean parseAcronyms, boolean parseFormatting, boolean parseUrls,
    100       boolean parseMusic, boolean parseMeText) {
    101     this.text = text;
    102     this.nextChar = 0;
    103     this.nextClass = 10;  // getCharClass() pre-increments this, so punctuation ids start at 11, above the fixed classes 0-4
    104     this.parts = new ArrayList<Part>();
    105     this.tokens = new ArrayList<Token>();
    106     this.formatStart = new HashMap<Character,Format>();
    107     this.parseSmilies = parseSmilies;
    108     this.parseAcronyms = parseAcronyms;
    109     this.parseFormatting = parseFormatting;
    110     this.parseUrls = parseUrls;
    111     this.parseMusic = parseMusic;
    112     this.parseMeText = parseMeText;
    113   }
    115   /** Returns the raw text being parsed; after parse() has split off a leading "/me" command, that prefix is no longer included. */
    116   public final String getRawText() { return text; }
    118   /** Returns the number of parts currently in the parts list. */
    119   public final int getPartCount() { return parts.size(); }
    121   /** Returns the part at the given index; an out-of-range index fails inside the list lookup. */
    122   public final Part getPart(int index) { return parts.get(index); }
    124   /** Returns the list of parts from the parsed text.  This is the parser's own list, not a copy. */
    125   public final List<Part> getParts() { return parts; }
    127   /** Parses the text string into an internal representation. */
    128   public void parse() {
    129     // Look for music track (of which there would be only one and it'll be the
    130     // first token)
    131     if (parseMusicTrack()) {
    132       buildParts(null);
    133       return;
    134     }
    136     // Look for me commands.
    137     String meText = null;
    138     if (parseMeText && text.startsWith("/me") && (text.length() > 3) &&
    139         Character.isWhitespace(text.charAt(3))) {
    140       meText = text.substring(0, 4);
    141       text = text.substring(4);
    142     }
    144     // Break the text into tokens.
    145     boolean wasSmiley = false;
    146     while (nextChar < text.length()) {
    147       if (!isWordBreak(nextChar)) {
    148         if (!wasSmiley || !isSmileyBreak(nextChar)) {
    149           throw new AssertionError("last chunk did not end at word break");
    150         }
    151       }
    153       if (parseSmiley()) {
    154         wasSmiley = true;
    155       } else {
    156         wasSmiley = false;
    158         if (!parseAcronym() && !parseURL() && !parseFormatting()) {
    159           parseText();
    160         }
    161       }
    162     }
    164     // Trim the whitespace before and after media components.
    165     for (int i = 0; i < tokens.size(); ++i) {
    166       if (tokens.get(i).isMedia()) {
    167         if ((i > 0) && (tokens.get(i - 1) instanceof Html)) {
    168           ((Html)tokens.get(i - 1)).trimLeadingWhitespace();
    169         }
    170         if ((i + 1 < tokens.size()) && (tokens.get(i + 1) instanceof Html)) {
    171           ((Html)tokens.get(i + 1)).trimTrailingWhitespace();
    172         }
    173       }
    174     }
    176     // Remove any empty html tokens.
    177     for (int i = 0; i < tokens.size(); ++i) {
    178       if (tokens.get(i).isHtml() &&
    179           (tokens.get(i).toHtml(true).length() == 0)) {
    180         tokens.remove(i);
    181         --i;  // visit this index again
    182       }
    183     }
    185     buildParts(meText);
    186   }
    188   /**
    189    * Get a the appropriate Token for a given URL
    190    *
    191    * @param text the anchor text
    192    * @param url the url
    193    *
    194    */
    195   public static Token tokenForUrl(String url, String text) {
    196     if(url == null) {
    197       return null;
    198     }
    200     //Look for video links
    201     Video video = Video.matchURL(url, text);
    202     if (video != null) {
    203       return video;
    204     }
    206     // Look for video links.
    207     YouTubeVideo ytVideo = YouTubeVideo.matchURL(url, text);
    208     if (ytVideo != null) {
    209       return ytVideo;
    210     }
    212     // Look for photo links.
    213     Photo photo = Photo.matchURL(url, text);
    214     if (photo != null) {
    215       return photo;
    216     }
    218     // Look for photo links.
    219     FlickrPhoto flickrPhoto = FlickrPhoto.matchURL(url, text);
    220     if (flickrPhoto != null) {
    221       return flickrPhoto;
    222     }
    224     //Not media, so must be a regular URL
    225     return new Link(url, text);
    226   }
    228   /**
    229    * Builds the parts list.
    230    *
    231    * @param meText any meText parsed from the message
    232    */
    233   private void buildParts(String meText) {
    234     for (int i = 0; i < tokens.size(); ++i) {
    235       Token token = tokens.get(i);
    236       if (token.isMedia() || (parts.size() == 0) || lastPart().isMedia()) {
    237         parts.add(new Part());
    238       }
    239       lastPart().add(token);
    240     }
    242     // The first part inherits the meText of the line.
    243     if (parts.size() > 0) {
    244       parts.get(0).setMeText(meText);
    245     }
    246   }
    248   /** Returns the last part in the list. */
    249   private Part lastPart() { return parts.get(parts.size() - 1); }
    251   /**
    252    * Looks for a music track (\u266B is first character, everything else is
    253    * track info).
    254    */
    255   private boolean parseMusicTrack() {
    257     if (parseMusic && text.startsWith(musicNote)) {
    258       addToken(new MusicTrack(text.substring(musicNote.length())));
    259       nextChar = text.length();
    260       return true;
    261     }
    262     return false;
    263   }
    265   /** Consumes all of the text in the next word . */
    266   private void parseText() {
    267     StringBuilder buf = new StringBuilder();
    268     int start = nextChar;
    269     do {
    270       char ch = text.charAt(nextChar++);
    271       switch (ch) {
    272         case '<':  buf.append("&lt;"); break;
    273         case '>':  buf.append("&gt;"); break;
    274         case '&':  buf.append("&amp;"); break;
    275         case '"':  buf.append("&quot;"); break;
    276         case '\'':  buf.append("&apos;"); break;
    277         case '\n':  buf.append("<br>"); break;
    278         default:  buf.append(ch); break;
    279       }
    280     } while (!isWordBreak(nextChar));
    282     addToken(new Html(text.substring(start, nextChar), buf.toString()));
    283   }
    285   /**
    286    * Looks for smileys (e.g., ":)") in the text.  The set of known smileys is
    287    * loaded from a file into a trie at server start.
    288    */
    289   private boolean parseSmiley() {
    290     if(!parseSmilies) {
    291       return false;
    292     }
    293     TrieNode match = longestMatch(getResources().getSmileys(), this, nextChar,
    294                                   true);
    295     if (match == null) {
    296       return false;
    297     } else {
    298       int previousCharClass = getCharClass(nextChar - 1);
    299       int nextCharClass = getCharClass(nextChar + match.getText().length());
    300       if ((previousCharClass == 2 || previousCharClass == 3)
    301           && (nextCharClass == 2 || nextCharClass == 3)) {
    302         return false;
    303       }
    304       addToken(new Smiley(match.getText()));
    305       nextChar += match.getText().length();
    306       return true;
    307     }
    308   }
    310   /** Looks for acronyms (e.g., "lol") in the text.
    311    */
    312   private boolean parseAcronym() {
    313     if(!parseAcronyms) {
    314       return false;
    315     }
    316     TrieNode match = longestMatch(getResources().getAcronyms(), this, nextChar);
    317     if (match == null) {
    318       return false;
    319     } else {
    320       addToken(new Acronym(match.getText(), match.getValue()));
    321       nextChar += match.getText().length();
    322       return true;
    323     }
    324   }
    326   /** Determines if this is an allowable domain character. */
    327   private boolean isDomainChar(char c) {
    328     return c == '-' || Character.isLetter(c) || Character.isDigit(c);
    329   }
    331   /** Determines if the given string is a valid domain. */
    332   private boolean isValidDomain(String domain) {
    333     // For hostnames, check that it ends with a known domain suffix
    334     if (matches(getResources().getDomainSuffixes(), reverse(domain))) {
    335       return true;
    336     }
    337     return false;
    338   }
    340   /**
    341    * Looks for a URL in two possible forms:  either a proper URL with a known
    342    * scheme or a domain name optionally followed by a path, query, or fragment.
           *
           * @return true if a URL token was added and nextChar was advanced past
           *         it; false, with nextChar left unchanged, if no URL starts here
    343    */
    344   private boolean parseURL() {
    345     // Make sure this is a valid place to start a URL.
    346     if (!parseUrls || !isURLBreak(nextChar)) {
    347       return false;
    348     }
    350     int start = nextChar;
    352     // Search for the first block of letters.
    353     int index = start;
    354     while ((index < text.length()) && isDomainChar(text.charAt(index))) {
    355       index += 1;
    356     }
    358     String url = "";
    359     boolean done = false;
    361     if (index == text.length()) {
    362       return false;
    363     } else if (text.charAt(index) == ':') {
    364       // Make sure this is a known scheme.
    365       String scheme = text.substring(nextChar, index);
    366       if (!getResources().getSchemes().contains(scheme)) {
    367         return false;
    368       }
              // done is still false here, so the scan below extends the URL up
              // to the next whitespace character.
    369     } else if (text.charAt(index) == '.') {
    370       // Search for the end of the domain name.
    371       while (index < text.length()) {
    372         char ch = text.charAt(index);
    373         if ((ch != '.') && !isDomainChar(ch)) {
    374           break;
    375         } else {
    376           index += 1;
    377         }
    378       }
    380       // Make sure the domain name has a valid suffix.  Since tries look for
    381       // prefix matches, we reverse all the strings to get suffix comparisons.
    382       String domain = text.substring(nextChar, index);
    383       if (!isValidDomain(domain)) {
    384         return false;
    385       }
    387       // Search for a port.  We deal with this specially because a colon can
    388       // also be a punctuation character.
    389       if ((index + 1 < text.length()) && (text.charAt(index) == ':')) {
    390         char ch = text.charAt(index + 1);
    391         if (Character.isDigit(ch)) {
    392           index += 1;
    393           while ((index < text.length()) &&
    394                  Character.isDigit(text.charAt(index))) {
    395             index += 1;
    396           }
    397         }
    398       }
    400       // The domain name should be followed by end of line, whitespace,
    401       // punctuation, or a colon, slash, question, or hash character.  The
    402       // tricky part here is that some URL characters are also punctuation, so
    403       // we need to distinguish them.  Since we looked for ports above, a colon
    404       // is always punctuation here.  To distinguish '?' cases, we look at the
    405       // character that follows it.
    406       if (index == text.length()) {
    407         done = true;
    408       } else {
    409         char ch = text.charAt(index);
    410         if (ch == '?') {
    411           // If the next character is whitespace or punctuation (or missing),
    412           // then this question mark looks like punctuation.
    413           if (index + 1 == text.length()) {
    414             done = true;
    415           } else {
    416             char ch2 = text.charAt(index + 1);
    417             if (Character.isWhitespace(ch2) || isPunctuation(ch2)) {
    418               done = true;
    419             }
    420           }
    421         } else if (isPunctuation(ch)) {
    422           done = true;
    423         } else if (Character.isWhitespace(ch)) {
    424           done = true;
    425         } else if ((ch == '/') || (ch == '#')) {
    426           // In this case, the URL is not done.  We will search for the end of
    427           // it below.
    428         } else {
    429           return false;
    430         }
    431       }
    433       // We will assume the user meant HTTP.  (One weird case is where they
    434       // type a port of 443.  That could mean HTTPS, but they might also want
    435       // HTTP.  We'll let them specify if they don't want HTTP.)
    436       url = "http://";
    437     } else {
    438       return false;
    439     }
    441     // If the URL is not done, search for the end, which is just before the
    442     // next whitespace character.
    443     if (!done) {
    444       while ((index < text.length()) &&
    445              !Character.isWhitespace(text.charAt(index))) {
    446         index += 1;
    447       }
    448     }
    450     String urlText = text.substring(start, index);
    451     url += urlText;  // url is "" for a scheme URL and "http://" for a bare domain
    453     // Figure out the appropriate token type.
    454     addURLToken(url, urlText);
    456     nextChar = index;
    457     return true;
    458   }
    460   /**
    461    * Adds the appropriate token for the given URL.  This might be a simple
    462    * link or it might be a recognized media type.
    463    */
    464   private void addURLToken(String url, String text) {
    465      addToken(tokenForUrl(url, text));
    466   }
    468   /**
    469    * Deal with formatting characters.
    470    *
    471    * Parsing is as follows:
    472    *  - Treat all contiguous strings of formatting characters as one block.
    473    *    (This method processes one block.)
    474    *  - Only a single instance of a particular format character within a block
    475    *    is used to determine whether to turn on/off that type of formatting;
    476    *    other instances simply print the character itself.
    477    *  - If the format is to be turned on, we use the _first_ instance; if it
    478    *    is to be turned off, we use the _last_ instance (by appending the
    479    *    format.)
    480    *
    481    * Example:
    482    *   **string** turns into <b>*string*</b>
    483    */
    484   private boolean parseFormatting() {
    485     if(!parseFormatting) {
    486       return false;
    487     }
    488     int endChar = nextChar;
    489     while ((endChar < text.length()) && isFormatChar(text.charAt(endChar))) {
    490       endChar += 1;
    491     }
    493     if ((endChar == nextChar) || !isWordBreak(endChar)) {
    494       return false;
    495     }
    497     // Keeps track of whether we've seen a character (in map if we've seen it)
    498     // and whether we should append a closing format token (if value in
    499     // map is TRUE).  Linked hashmap for consistent ordering.
    500     LinkedHashMap<Character, Boolean> seenCharacters =
    501         new LinkedHashMap<Character, Boolean>();
    503     for (int index = nextChar; index < endChar; ++index) {
    504       char ch = text.charAt(index);
    505       Character key = Character.valueOf(ch);
    506       if (seenCharacters.containsKey(key)) {
    507         // Already seen this character, just append an unmatched token, which
    508         // will print plaintext character
    509         addToken(new Format(ch, false));
    510       } else {
    511         Format start = formatStart.get(key);
    512         if (start != null) {
    513           // Match the start token, and ask an end token to be appended
    514           start.setMatched(true);
    515           formatStart.remove(key);
    516           seenCharacters.put(key, Boolean.TRUE);
    517         } else {
    518           // Append start token
    519           start = new Format(ch, true);
    520           formatStart.put(key, start);
    521           addToken(start);
    522           seenCharacters.put(key, Boolean.FALSE);
    523         }
    524       }
    525     }
    527     // Append any necessary end tokens
    528     for (Character key : seenCharacters.keySet()) {
    529       if (seenCharacters.get(key) == Boolean.TRUE) {
    530         Format end = new Format(key.charValue(), false);
    531         end.setMatched(true);
    532         addToken(end);
    533       }
    534     }
    536     nextChar = endChar;
    537     return true;
    538   }
    540   /** Determines whether the given index could be a possible word break. */
    541   private boolean isWordBreak(int index) {
    542     return getCharClass(index - 1) != getCharClass(index);
    543   }
    545   /** Determines whether the given index could be a possible smiley break. */
    546   private boolean isSmileyBreak(int index) {
    547     if (index > 0 && index < text.length()) {
    548       if (isSmileyBreak(text.charAt(index - 1), text.charAt(index))) {
    549         return true;
    550       }
    551     }
    553     return false;
    554   }
    556   /**
    557    * Verifies that the character before the given index is end of line,
    558    * whitespace, or punctuation.
    559    */
    560   private boolean isURLBreak(int index) {
    561     switch (getCharClass(index - 1)) {
    562       case 2:
    563       case 3:
    564       case 4:
    565         return false;
    567       case 0:
    568       case 1:
    569       default:
    570         return true;
    571     }
    572   }
    574   /** Returns the class for the character at the given index. */
    575   private int getCharClass(int index) {
    576     if ((index < 0) || (text.length() <= index)) {
    577       return 0;
    578     }
    580     char ch = text.charAt(index);
    581     if (Character.isWhitespace(ch)) {
    582       return 1;
    583     } else if (Character.isLetter(ch)) {
    584       return 2;
    585     } else if (Character.isDigit(ch)) {
    586       return 3;
    587     } else if (isPunctuation(ch)) {
    588       // For punctuation, we return a unique value every time so that they are
    589       // always different from any other character.  Punctuation should always
    590       // be considered a possible word break.
    591       return ++nextClass;
    592     } else {
    593       return 4;
    594     }
    595   }
    597   /**
    598    * Returns true if <code>c1</code> could be the last character of
    599    * a smiley and <code>c2</code> could be the first character of
    600    * a different smiley, if {@link #isWordBreak} would not already
    601    * recognize that this is possible.
    602    */
    603   private static boolean isSmileyBreak(char c1, char c2) {
    604     switch (c1) {
    605       /*
    606        * These characters can end smileys, but don't normally end words.
    607        */
    608       case '$': case '&': case '*': case '+': case '-':
    609       case '/': case '<': case '=': case '>': case '@':
    610       case '[': case '\\': case ']': case '^': case '|':
    611       case '}': case '~':
    612         switch (c2) {
    613           /*
    614            * These characters can begin smileys, but don't normally
    615            * begin words.
    616            */
    617           case '#': case '$': case '%': case '*': case '/':
    618           case '<': case '=': case '>': case '@': case '[':
    619           case '\\': case '^': case '~':
    620             return true;
    621         }
    622     }
    624     return false;
    625   }
    627   /** Determines whether the given character is punctuation. */
    628   private static boolean isPunctuation(char ch) {
    629     switch (ch) {
    630       case '.': case ',': case '"': case ':': case ';':
    631       case '?': case '!': case '(': case ')':
    632         return true;
    634       default:
    635         return false;
    636     }
    637   }
    639   /**
    640    * Determines whether the given character is the beginning or end of a
    641    * section with special formatting.
    642    */
    643   private static boolean isFormatChar(char ch) {
    644     switch (ch) {
    645       case '*': case '_': case '^':
    646         return true;
    648       default:
    649         return false;
    650     }
    651   }
    653   /** Represents a unit of parsed output. */
    654   public static abstract class Token {
    655     public enum Type {
    657       HTML ("html"),
    658       FORMAT ("format"),  // subtype of HTML
    659       LINK ("l"),
    660       SMILEY ("e"),
    661       ACRONYM ("a"),
    662       MUSIC ("m"),
    663       GOOGLE_VIDEO ("v"),
    664       YOUTUBE_VIDEO ("yt"),
    665       PHOTO ("p"),
    666       FLICKR ("f");
    668       //stringreps for HTML and FORMAT don't really matter
    669       //because they don't define getInfo(), which is where it is used
    670       //For the other types, code depends on their stringreps
    671       private String stringRep;
    673       Type(String stringRep) {
    674         this.stringRep = stringRep;
    675       }
    677       /** {@inheritDoc} */
    678       public String toString() {
    679         return this.stringRep;
    680       }
    681     }
    683     protected Type type;
    684     protected String text;
    686     protected Token(Type type, String text) {
    687       this.type = type;
    688       this.text = text;
    689     }
    691     /** Returns the type of the token. */
    692     public Type getType() { return type; }
    694     /**
    695      * Get the relevant information about a token
    696      *
    697      * @return a list of strings representing the token, not null
    698      *         The first item is always a string representation of the type
    699      */
    700     public List<String> getInfo() {
    701       List<String> info = new ArrayList<String>();
    702       info.add(getType().toString());
    703       return info;
    704     }
    706     /** Returns the raw text of the token. */
    707     public String getRawText() { return text; }
    709     public boolean isMedia() { return false; }
    710     public abstract boolean isHtml();
    711     public boolean isArray() { return !isHtml(); }
    713     public String toHtml(boolean caps) { throw new AssertionError("not html"); }
    715     // The token can change the caps of the text after that point.
    716     public boolean controlCaps() { return false; }
    717     public boolean setCaps() { return false; }
    718   }
    720   /** Represents a simple string of html text. */
    721   public static class Html extends Token {
    722     private String html;
    724     public Html(String text, String html) {
    725       super(Type.HTML, text);
    726       this.html = html;
    727     }
    729     public boolean isHtml() { return true; }
    730     public String toHtml(boolean caps) {
    731       return caps ? html.toUpperCase() : html;
    732     }
    733     /**
    734      * Not supported. Info should not be needed for this type
    735      */
    736     public List<String> getInfo() {
    737       throw new UnsupportedOperationException();
    738     }
    740     public void trimLeadingWhitespace() {
    741       text = trimLeadingWhitespace(text);
    742       html = trimLeadingWhitespace(html);
    743     }
    745     public void trimTrailingWhitespace() {
    746       text = trimTrailingWhitespace(text);
    747       html = trimTrailingWhitespace(html);
    748     }
    750     private static String trimLeadingWhitespace(String text) {
    751       int index = 0;
    752       while ((index < text.length()) &&
    753              Character.isWhitespace(text.charAt(index))) {
    754         ++index;
    755       }
    756       return text.substring(index);
    757     }
    759     public static String trimTrailingWhitespace(String text) {
    760       int index = text.length();
    761       while ((index > 0) && Character.isWhitespace(text.charAt(index - 1))) {
    762         --index;
    763       }
    764       return text.substring(0, index);
    765     }
    766   }
    768   /** Represents a music track token at the beginning. */
    769   public static class MusicTrack extends Token {
    770     private String track;
    772     public MusicTrack(String track) {
    773       super(Type.MUSIC, track);
    774       this.track = track;
    775     }
    777     public String getTrack() { return track; }
    779     public boolean isHtml() { return false; }
    781     public List<String> getInfo() {
    782       List<String> info = super.getInfo();
    783       info.add(getTrack());
    784       return info;
    785     }
    786   }
    788   /** Represents a link that was found in the input. */
    789   public static class Link extends Token {
    790     private String url;
    792     public Link(String url, String text) {
    793       super(Type.LINK, text);
    794       this.url = url;
    795     }
    797     public String getURL() { return url; }
    799     public boolean isHtml() { return false; }
    801     public List<String> getInfo() {
    802       List<String> info = super.getInfo();
    803       info.add(getURL());
    804       info.add(getRawText());
    805       return info;
    806     }
    807   }
    809   /** Represents a link to a Google Video. */
    810   public static class Video extends Token {
    811     /** Pattern for a video URL. */
    812     private static final Pattern URL_PATTERN = Pattern.compile(
    813         "(?i)http://video\\.google\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/videoplay\\?"
    814         + ".*?\\bdocid=(-?\\d+).*");
    816     private String docid;
    818     public Video(String docid, String text) {
    819       super(Type.GOOGLE_VIDEO, text);
    820       this.docid = docid;
    821     }
    823     public String getDocID() { return docid; }
    825     public boolean isHtml() { return false; }
    826     public boolean isMedia() { return true; }
    828     /** Returns a Video object if the given url is to a video. */
    829     public static Video matchURL(String url, String text) {
    830       Matcher m = URL_PATTERN.matcher(url);
    831       if (m.matches()) {
    832         return new Video(m.group(1), text);
    833       } else {
    834         return null;
    835       }
    836     }
    838     public List<String> getInfo() {
    839       List<String> info = super.getInfo();
    840       info.add(getRssUrl(docid));
    841       info.add(getURL(docid));
    842       return info;
    843     }
    845     /** Returns the URL for the RSS description of the given video. */
    846     public static String getRssUrl(String docid) {
    847       return "http://video.google.com/videofeed"
    848              + "?type=docid&output=rss&sourceid=gtalk&docid=" + docid;
    849     }
    851     /** (For testing purposes:) Returns a video URL with the given parts.  */
    852     public static String getURL(String docid) {
    853       return getURL(docid, null);
    854     }
    856     /** (For testing purposes:) Returns a video URL with the given parts.  */
    857     public static String getURL(String docid, String extraParams) {
    858       if (extraParams == null) {
    859         extraParams = "";
    860       } else if (extraParams.length() > 0) {
    861         extraParams += "&";
    862       }
    863       return "http://video.google.com/videoplay?" + extraParams
    864              + "docid=" + docid;
    865     }
    866   }
    868   /** Represents a link to a YouTube video. */
    869   public static class YouTubeVideo extends Token {
    870     /**
             * Pattern for a video URL.  Matching is case-insensitive and only
             * covers http:// URLs whose host is a youtube domain with at most
             * one label in front of it; group 1 captures the value of the v
             * parameter of the watch page.
             */
    871     private static final Pattern URL_PATTERN = Pattern.compile(
    872         "(?i)http://(?:[a-z0-9]+\\.)?youtube\\.[a-z0-9]+(?:\\.[a-z0-9]+)?/watch\\?"
    873         + ".*\\bv=([-_a-zA-Z0-9=]+).*");
    875     private String docid;  // video id, as captured from the v parameter by matchURL
    877     public YouTubeVideo(String docid, String text) {
              // text becomes the token's raw text; docid identifies the video.
    878       super(Type.YOUTUBE_VIDEO, text);
    879       this.docid = docid;
    880     }
    882     public String getDocID() { return docid; }
    884     public boolean isHtml() { return false; }
    885     public boolean isMedia() { return true; }
    887     /** Returns a Video object if the given url is to a video. */
    888     public static YouTubeVideo matchURL(String url, String text) {
    889       Matcher m = URL_PATTERN.matcher(url);
    890       if (m.matches()) {
    891         return new YouTubeVideo(m.group(1), text);
    892       } else {
    893         return null;
    894       }
    895     }
    897     public List<String> getInfo() {
    898       List<String> info = super.getInfo();
    899       info.add(getRssUrl(docid));
    900       info.add(getURL(docid));
    901       return info;
    902     }
    904     /** Returns the URL for the RSS description of the given video. */
    905     public static String getRssUrl(String docid) {
    906       return "http://youtube.com/watch?v=" + docid;
    907     }
    909     /** (For testing purposes:) Returns a video URL with the given parts.  */
    910     public static String getURL(String docid) {
    911       return getURL(docid, null);
    912     }
    914     /** (For testing purposes:) Returns a video URL with the given parts.  */
    915     public static String getURL(String docid, String extraParams) {
    916       if (extraParams == null) {
    917         extraParams = "";
    918       } else if (extraParams.length() > 0) {
    919         extraParams += "&";
    920       }
    921       return "http://youtube.com/watch?" + extraParams + "v=" + docid;
    922     }
    924     /** (For testing purposes:) Returns a video URL with the given parts.
    925       * @param http If true, includes http://
    926       * @param prefix If non-null/non-blank, adds to URL before youtube.com.
    927       *   (e.g., prefix="br." --> "br.youtube.com")
    928       */
    929     public static String getPrefixedURL(boolean http, String prefix,
    930                                         String docid, String extraParams) {
    931       String protocol = "";
    933       if (http) {
    934         protocol = "http://";
    935       }
    937       if (prefix == null) {
    938         prefix = "";
    939       }
    941       if (extraParams == null) {
    942         extraParams = "";
    943       } else if (extraParams.length() > 0) {
    944         extraParams += "&";
    945       }
    947       return protocol + prefix + "youtube.com/watch?" + extraParams + "v=" +
    948               docid;
    949     }
    950   }
    952   /** Represents a link to a Picasa photo or album. */
    953   public static class Photo extends Token {
    954     /** Pattern for an album or photo URL. */
    955     // TODO (katyarogers) searchbrowse includes search lists and tags,
    956     // it follows a different pattern than albums - would be nice to add later
    957     private static final Pattern URL_PATTERN = Pattern.compile(
    958         "http://picasaweb.google.com/([^/?#&]+)/+((?!searchbrowse)[^/?#&]+)(?:/|/photo)?(?:\\?[^#]*)?(?:#(.*))?");
    960     private String user;
    961     private String album;
    962     private String photo;  // null for albums
    964     public Photo(String user, String album, String photo, String text) {
    965       super(Type.PHOTO, text);
    966       this.user = user;
    967       this.album = album;
    968       this.photo = photo;
    969     }
    971     public String getUser() { return user; }
    972     public String getAlbum() { return album; }
    973     public String getPhoto() { return photo; }
    975     public boolean isHtml() { return false; }
    976     public boolean isMedia() { return true; }
    978     /** Returns a Photo object if the given url is to a photo or album. */
    979     public static Photo matchURL(String url, String text) {
    980       Matcher m = URL_PATTERN.matcher(url);
    981       if (m.matches()) {
    982         return new Photo(m.group(1), m.group(2), m.group(3), text);
    983       } else {
    984         return null;
    985       }
    986     }
    988     public List<String> getInfo() {
    989       List<String> info = super.getInfo();
    990       info.add(getRssUrl(getUser()));
    991       info.add(getAlbumURL(getUser(), getAlbum()));
    992       if (getPhoto() != null) {
    993         info.add(getPhotoURL(getUser(), getAlbum(), getPhoto()));
    994       } else {
    995         info.add((String)null);
    996       }
    997       return info;
    998     }
   1000     /** Returns the URL for the RSS description of the user's albums. */
   1001     public static String getRssUrl(String user) {
   1002       return "http://picasaweb.google.com/data/feed/api/user/" + user +
   1003         "?category=album&alt=rss";
   1004     }
   1006     /** Returns the URL for an album. */
   1007     public static String getAlbumURL(String user, String album) {
   1008       return "http://picasaweb.google.com/" + user + "/" + album;
   1009     }
   1011     /** Returns the URL for a particular photo. */
   1012     public static String getPhotoURL(String user, String album, String photo) {
   1013       return "http://picasaweb.google.com/" + user + "/" + album + "/photo#"
   1014              + photo;
   1015     }
   1016   }
   1018   /** Represents a link to a Flickr photo or album. */
   1019   public static class FlickrPhoto extends Token {
   1020     /** Pattern for a user album or photo URL. */
   1021     private static final Pattern URL_PATTERN = Pattern.compile(
   1022         "http://(?:www.)?flickr.com/photos/([^/?#&]+)/?([^/?#&]+)?/?.*");
   1023     private static final Pattern GROUPING_PATTERN = Pattern.compile(
   1024         "http://(?:www.)?flickr.com/photos/([^/?#&]+)/(tags|sets)/" +
   1025         "([^/?#&]+)/?");
   1027     private static final String SETS = "sets";
   1028     private static final String TAGS = "tags";
   1030     private String user;
   1031     private String photo;      // null for user album
   1032     private String grouping;   // either "tags" or "sets"
   1033     private String groupingId; // sets or tags identifier
   1035     public FlickrPhoto(String user, String photo, String grouping,
   1036                        String groupingId, String text) {
   1037       super(Type.FLICKR, text);
   1039       /* System wide tags look like the URL to a Flickr user. */
   1040       if (!TAGS.equals(user)) {
   1041         this.user = user;
   1042         // Don't consider slide show URL a photo
   1043         this.photo = (!"show".equals(photo) ? photo : null);
   1044         this.grouping = grouping;
   1045         this.groupingId = groupingId;
   1046       } else {
   1047         this.user = null;
   1048         this.photo = null;
   1049         this.grouping = TAGS;
   1050         this.groupingId = photo;
   1051       }
   1052     }
   1054     public String getUser() { return user; }
   1055     public String getPhoto() { return photo; }
   1056     public String getGrouping() { return grouping; }
   1057     public String getGroupingId() { return groupingId; }
   1059     public boolean isHtml() { return false; }
   1060     public boolean isMedia() { return true; }
   1062     /**
   1063      * Returns a FlickrPhoto object if the given url is to a photo or Flickr
   1064      * user.
   1065      */
   1066     public static FlickrPhoto matchURL(String url, String text) {
   1067       Matcher m = GROUPING_PATTERN.matcher(url);
   1068       if (m.matches()) {
   1069         return new FlickrPhoto(m.group(1), null, m.group(2), m.group(3), text);
   1070       }
   1072       m = URL_PATTERN.matcher(url);
   1073       if (m.matches()) {
   1074         return new FlickrPhoto(m.group(1), m.group(2), null, null, text);
   1075       } else {
   1076         return null;
   1077       }
   1078     }
   1080     public List<String> getInfo() {
   1081       List<String> info = super.getInfo();
   1082       info.add(getUrl());
   1083       info.add(getUser() != null ? getUser() : "");
   1084       info.add(getPhoto() != null ? getPhoto() : "");
   1085       info.add(getGrouping() != null ? getGrouping() : "");
   1086       info.add(getGroupingId() != null ? getGroupingId() : "");
   1087       return info;
   1088     }
   1090     public String getUrl() {
   1091       if (SETS.equals(grouping)) {
   1092         return getUserSetsURL(user, groupingId);
   1093       } else if (TAGS.equals(grouping)) {
   1094         if (user != null) {
   1095           return getUserTagsURL(user, groupingId);
   1096         } else {
   1097           return getTagsURL(groupingId);
   1098         }
   1099       } else if (photo != null) {
   1100         return getPhotoURL(user, photo);
   1101       } else {
   1102         return getUserURL(user);
   1103       }
   1104     }
   1106     /** Returns the URL for the RSS description. */
   1107     public static String getRssUrl(String user) {
   1108       return null;
   1109     }
   1111     /** Returns the URL for a particular tag. */
   1112     public static String getTagsURL(String tag) {
   1113       return "http://flickr.com/photos/tags/" + tag;
   1114     }
   1116     /** Returns the URL to the user's Flickr homepage. */
   1117     public static String getUserURL(String user) {
   1118       return "http://flickr.com/photos/" + user;
   1119     }
   1121     /** Returns the URL for a particular photo. */
   1122     public static String getPhotoURL(String user, String photo) {
   1123       return "http://flickr.com/photos/" + user + "/" + photo;
   1124     }
   1126     /** Returns the URL for a user tag photo set. */
   1127     public static String getUserTagsURL(String user, String tagId) {
   1128       return "http://flickr.com/photos/" + user + "/tags/" + tagId;
   1129     }
   1131     /** Returns the URL for user set. */
   1132     public static String getUserSetsURL(String user, String setId) {
   1133       return "http://flickr.com/photos/" + user + "/sets/" + setId;
   1134     }
   1135   }
   1137   /** Represents a smiley that was found in the input. */
   1138   public static class Smiley extends Token {
   1139     // TODO: Pass the SWF URL down to the client.
   1141     public Smiley(String text) {
   1142       super(Type.SMILEY, text);
   1143     }
   1145     public boolean isHtml() { return false; }
   1147     public List<String> getInfo() {
   1148       List<String> info = super.getInfo();
   1149       info.add(getRawText());
   1150       return info;
   1151     }
   1152   }
   1154   /** Represents an acronym that was found in the input. */
   1155   public static class Acronym extends Token {
   1156     private String value;
   1157     // TODO: SWF
   1159     public Acronym(String text, String value) {
   1160       super(Type.ACRONYM, text);
   1161       this.value = value;
   1162     }
   1164     public String getValue() { return value; }
   1166     public boolean isHtml() { return false; }
   1168     public List<String> getInfo() {
   1169       List<String> info = super.getInfo();
   1170       info.add(getRawText());
   1171       info.add(getValue());
   1172       return info;
   1173     }
   1174   }
   1176   /** Represents a character that changes formatting. */
   1177   public static class Format extends Token {
   1178     private char ch;
   1179     private boolean start;
   1180     private boolean matched;
   1182     public Format(char ch, boolean start) {
   1183       super(Type.FORMAT, String.valueOf(ch));
   1184       this.ch = ch;
   1185       this.start = start;
   1186     }
   1188     public void setMatched(boolean matched) { this.matched = matched; }
   1190     public boolean isHtml() { return true; }
   1192     public String toHtml(boolean caps) {
   1193       // This character only implies special formatting if it was matched.
   1194       // Otherwise, it was just a plain old character.
   1195       if (matched) {
   1196         return start ? getFormatStart(ch) : getFormatEnd(ch);
   1197       } else {
   1198         // We have to make sure we escape HTML characters as usual.
   1199         return (ch == '"') ? "&quot;" : String.valueOf(ch);
   1200       }
   1201     }
   1203     /**
   1204      * Not supported. Info should not be needed for this type
   1205      */
   1206     public List<String> getInfo() {
   1207       throw new UnsupportedOperationException();
   1208     }
   1210     public boolean controlCaps() { return (ch == '^'); }
   1211     public boolean setCaps() { return start; }
   1213     private String getFormatStart(char ch) {
   1214       switch (ch) {
   1215         case '*': return "<b>";
   1216         case '_': return "<i>";
   1217         case '^': return "<b><font color=\"#005FFF\">"; // TODO: all caps
   1218         case '"': return "<font color=\"#999999\">\u201c";
   1219         default: throw new AssertionError("unknown format '" + ch + "'");
   1220       }
   1221     }
   1223     private String getFormatEnd(char ch) {
   1224       switch (ch) {
   1225         case '*': return "</b>";
   1226         case '_': return "</i>";
   1227         case '^': return "</font></b>"; // TODO: all caps
   1228         case '"': return "\u201d</font>";
   1229         default: throw new AssertionError("unknown format '" + ch + "'");
   1230       }
   1231     }
   1232   }
   1234   /** Adds the given token to the parsed output. */
   1235   private void addToken(Token token) {
   1236     tokens.add(token);
   1237   }
   1239   /** Converts the entire message into a single HTML display string. */
   1240   public String toHtml() {
   1241     StringBuilder html = new StringBuilder();
   1243     for (Part part : parts) {
   1244       boolean caps = false;
   1246       html.append("<p>");
   1247       for (Token token : part.getTokens()) {
   1248         if (token.isHtml()) {
   1249           html.append(token.toHtml(caps));
   1250         } else {
   1251           switch (token.getType()) {
   1252           case LINK:
   1253             html.append("<a href=\"");
   1254             html.append(((Link)token).getURL());
   1255             html.append("\">");
   1256             html.append(token.getRawText());
   1257             html.append("</a>");
   1258             break;
   1260           case SMILEY:
   1261             // TODO: link to an appropriate image
   1262             html.append(token.getRawText());
   1263             break;
   1265           case ACRONYM:
   1266             html.append(token.getRawText());
   1267             break;
   1269           case MUSIC:
   1270             // TODO: include a music glyph
   1271             html.append(((MusicTrack)token).getTrack());
   1272             break;
   1274           case GOOGLE_VIDEO:
   1275             // TODO: include a Google Video icon
   1276             html.append("<a href=\"");
   1277             html.append(((Video)token).getURL(((Video)token).getDocID()));
   1278             html.append("\">");
   1279             html.append(token.getRawText());
   1280             html.append("</a>");
   1281             break;
   1283           case YOUTUBE_VIDEO:
   1284             // TODO: include a YouTube icon
   1285             html.append("<a href=\"");
   1286             html.append(((YouTubeVideo)token).getURL(
   1287                 ((YouTubeVideo)token).getDocID()));
   1288             html.append("\">");
   1289             html.append(token.getRawText());
   1290             html.append("</a>");
   1291             break;
   1293           case PHOTO: {
   1294             // TODO: include a Picasa Web icon
   1295             html.append("<a href=\"");
   1296             html.append(Photo.getAlbumURL(
   1297                 ((Photo)token).getUser(), ((Photo)token).getAlbum()));
   1298             html.append("\">");
   1299             html.append(token.getRawText());
   1300             html.append("</a>");
   1301             break;
   1302           }
   1304           case FLICKR:
   1305             // TODO: include a Flickr icon
   1306             Photo p = (Photo) token;
   1307             html.append("<a href=\"");
   1308             html.append(((FlickrPhoto)token).getUrl());
   1309             html.append("\">");
   1310             html.append(token.getRawText());
   1311             html.append("</a>");
   1312             break;
   1314           default:
   1315             throw new AssertionError("unknown token type: " + token.getType());
   1316           }
   1317         }
   1319         if (token.controlCaps()) {
   1320           caps = token.setCaps();
   1321         }
   1322       }
   1323       html.append("</p>\n");
   1324     }
   1326     return html.toString();
   1327   }
   1329   /** Returns the reverse of the given string. */
   1330   protected static String reverse(String str) {
   1331     StringBuilder buf = new StringBuilder();
   1332     for (int i = str.length() - 1; i >= 0; --i) {
   1333       buf.append(str.charAt(i));
   1334     }
   1335     return buf.toString();
   1336   }
   1338   public static class TrieNode {
   1339     private final HashMap<Character,TrieNode> children =
   1340         new HashMap<Character,TrieNode>();
   1341     private String text;
   1342     private String value;
   1344     public TrieNode() { this(""); }
   1345     public TrieNode(String text) {
   1346       this.text = text;
   1347     }
   1349     public final boolean exists() { return value != null; }
   1350     public final String getText() { return text; }
   1351     public final String getValue() { return value; }
   1352     public void setValue(String value) { this.value = value; }
   1354     public TrieNode getChild(char ch) {
   1355       return children.get(Character.valueOf(ch));
   1356     }
   1358     public TrieNode getOrCreateChild(char ch) {
   1359       Character key = Character.valueOf(ch);
   1360       TrieNode node = children.get(key);
   1361       if (node == null) {
   1362         node = new TrieNode(text + String.valueOf(ch));
   1363         children.put(key, node);
   1364       }
   1365       return node;
   1366     }
   1368     /** Adds the given string into the trie. */
   1369     public static  void addToTrie(TrieNode root, String str, String value) {
   1370       int index = 0;
   1371       while (index < str.length()) {
   1372         root = root.getOrCreateChild(str.charAt(index++));
   1373       }
   1374       root.setValue(value);
   1375     }
   1376   }
   1380   /** Determines whether the given string is in the given trie. */
   1381   private static boolean matches(TrieNode root, String str) {
   1382     int index = 0;
   1383     while (index < str.length()) {
   1384       root = root.getChild(str.charAt(index++));
   1385       if (root == null) {
   1386         break;
   1387       } else if (root.exists()) {
   1388         return true;
   1389       }
   1390     }
   1391     return false;
   1392   }
   1394   /**
   1395    * Returns the longest substring of the given string, starting at the given
   1396    * index, that exists in the trie.
   1397    */
   1398   private static TrieNode longestMatch(
   1399       TrieNode root, AbstractMessageParser p, int start) {
   1400     return longestMatch(root, p, start, false);
   1401   }
   1403   /**
   1404    * Returns the longest substring of the given string, starting at the given
   1405    * index, that exists in the trie, with a special tokenizing case for
   1406    * smileys if specified.
   1407    */
   1408   private static TrieNode longestMatch(
   1409       TrieNode root, AbstractMessageParser p, int start, boolean smiley) {
   1410     int index = start;
   1411     TrieNode bestMatch = null;
   1412     while (index < p.getRawText().length()) {
   1413       root = root.getChild(p.getRawText().charAt(index++));
   1414       if (root == null) {
   1415         break;
   1416       } else if (root.exists()) {
   1417         if (p.isWordBreak(index)) {
   1418           bestMatch = root;
   1419         } else if (smiley && p.isSmileyBreak(index)) {
   1420           bestMatch = root;
   1421         }
   1422       }
   1423     }
   1424     return bestMatch;
   1425   }
   1428   /** Represents set of tokens that are delivered as a single message. */
   1429   public static class Part {
   1430     private String meText;
   1431     private ArrayList<Token> tokens;
   1433     public Part() {
   1434       this.tokens = new ArrayList<Token>();
   1435     }
   1437     public String getType(boolean isSend) {
   1438       return (isSend ? "s" : "r") + getPartType();
   1439     }
   1441     private String getPartType() {
   1442       if (isMedia()) {
   1443         return "d";
   1444       } else if (meText != null) {
   1445         return "m";
   1446       } else {
   1447         return "";
   1448       }
   1449     }
   1451     public boolean isMedia() {
   1452       return (tokens.size() == 1) && tokens.get(0).isMedia();
   1453     }
   1454     /**
   1455      * Convenience method for getting the Token of a Part that represents
   1456      * a media Token. Parts of this kind will always only have a single Token
   1457      *
   1458      * @return if this.isMedia(),
   1459      *         returns the Token representing the media contained in this Part,
   1460      *         otherwise returns null;
   1461      */
   1462     public Token getMediaToken() {
   1463       if(isMedia()) {
   1464         return tokens.get(0);
   1465       }
   1466       return null;
   1467     }
   1469     /** Adds the given token to this part. */
   1470     public void add(Token token) {
   1471       if (isMedia()) {
   1472         throw new AssertionError("media ");
   1473       }
   1474        tokens.add(token);
   1475     }
   1477     public void setMeText(String meText) {
   1478       this.meText = meText;
   1479     }
   1481     /** Returns the original text of this part. */
   1482     public String getRawText() {
   1483       StringBuilder buf = new StringBuilder();
   1484       if (meText != null) {
   1485         buf.append(meText);
   1486       }
   1487       for (int i = 0; i < tokens.size(); ++i) {
   1488         buf.append(tokens.get(i).getRawText());
   1489       }
   1490       return buf.toString();
   1491     }
   1493     /** Returns the tokens in this part. */
   1494     public ArrayList<Token> getTokens() { return tokens; }
   1496     /** Adds the tokens into the given builder as an array. */
   1497 //    public void toArray(JSArrayBuilder array) {
   1498 //      if (isMedia()) {
   1499 //        // For media, we send its array (i.e., we don't wrap this in another
   1500 //        // array as we do for non-media parts).
   1501 //        tokens.get(0).toArray(array);
   1502 //      } else {
   1503 //        array.beginArray();
   1504 //        addToArray(array);
   1505 //        array.endArray();
   1506 //      }
   1507 //    }
   1508   }
   1509 }